# Copyright 2012 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
from whoosh import matching
from whoosh.query import qcore
from whoosh.query.wrappers import WrappingQuery


class NestedParent(WrappingQuery):
"""A query that allows you to search for "nested" documents, where you can
index (possibly multiple levels of) "parent" and "child" documents using
the :meth:`~whoosh.writing.IndexWriter.group` and/or
:meth:`~whoosh.writing.IndexWriter.start_group` methods of a
:class:`whoosh.writing.IndexWriter` to indicate that hierarchically related
documents should be kept together::
schema = fields.Schema(type=fields.ID, text=fields.TEXT(stored=True))
with ix.writer() as w:
# Say we're indexing chapters (type=chap) and each chapter has a
# number of paragraphs (type=p)
with w.group():
w.add_document(type="chap", text="Chapter 1")
w.add_document(type="p", text="Able baker")
w.add_document(type="p", text="Bright morning")
with w.group():
w.add_document(type="chap", text="Chapter 2")
w.add_document(type="p", text="Car trip")
w.add_document(type="p", text="Dog eared")
w.add_document(type="p", text="Every day")
with w.group():
w.add_document(type="chap", text="Chapter 3")
w.add_document(type="p", text="Fine day")
The ``NestedParent`` query wraps two sub-queries: the "parent query"
matches a class of "parent documents". The "sub query" matches nested
documents you want to find. For each "sub document" the "sub query" finds,
this query acts as if it found the corresponding "parent document".
    >>> with ix.searcher() as s:
    ...     r = s.search(query.NestedParent(query.Term("type", "chap"),
    ...                                     query.Term("text", "day")))
    ...     for hit in r:
    ...         print(hit["text"])
    ...
    Chapter 2
    Chapter 3
    """
    def __init__(self, parents, subq, per_parent_limit=None, score_fn=sum):
        """
        :param parents: a query, DocIdSet object, or Results object
            representing the documents you want to use as the "parent"
            documents. Where the sub-query matches, the corresponding document
            in these results will be returned as the match.
        :param subq: a query matching the information you want to find.
        :param per_parent_limit: a maximum number of "sub documents" to search
            per parent. The default is None, meaning no limit.
        :param score_fn: a function to use to combine the scores of matching
            sub-documents to calculate the score returned for the parent
            document. The default is ``sum``, that is, add up the scores of the
            sub-documents.
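
        For example, a sketch of scoring each chapter by its best-matching
        paragraph instead of the sum of its paragraphs (``chapter_q`` and
        ``text_q`` are placeholder queries, not part of this module)::

            chapter_q = query.Term("type", "chap")
            text_q = query.Term("text", "day")
            q = NestedParent(chapter_q, text_q, score_fn=max)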
"""
self.parents = parents
self.child = subq
self.per_parent_limit = per_parent_limit
self.score_fn = score_fn
    def normalize(self):
        p = self.parents
        if isinstance(p, qcore.Query):
            p = p.normalize()
        q = self.child.normalize()
        if p is qcore.NullQuery or q is qcore.NullQuery:
            return qcore.NullQuery
        return self.__class__(p, q)

    def requires(self):
        return self.child.requires()
    def matcher(self, searcher, context=None):
        bits = searcher._filter_to_comb(self.parents)
        if not bits:
            return matching.NullMatcher
        m = self.child.matcher(searcher, context)
        if not m.is_active():
            return matching.NullMatcher

        return self.NestedParentMatcher(
            bits, m, self.per_parent_limit, searcher.doc_count_all(), self.score_fn
        )
    def deletion_docs(self, searcher):
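        # Yield the doc numbers of every document in each group (the parent
        # and all docs up to the next parent) that contains at least one
        # matching child, presumably so a delete-by-query removes whole
        # groups at once rather than individual children.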
        bits = searcher._filter_to_comb(self.parents)
        if not bits:
            return
        m = self.child.matcher(searcher, searcher.boolean_context())
        maxdoc = searcher.doc_count_all()
        while m.is_active():
            docnum = m.id()
            parentdoc = bits.before(docnum + 1)
            nextparent = bits.after(docnum) or maxdoc
            yield from range(parentdoc, nextparent)
            m.skip_to(nextparent)
    class NestedParentMatcher(matching.Matcher):
        def __init__(self, comb, child, per_parent_limit, maxdoc, score_fn):
            self.comb = comb
            self.child = child
            self.per_parent_limit = per_parent_limit
            self.maxdoc = maxdoc
            self.score_fn = score_fn
            self._nextdoc = None
            if self.child.is_active():
                self._gather()

        def is_active(self):
            return self._nextdoc is not None

        def supports_block_quality(self):
            return False
        def _gather(self):
            # This is where the magic happens ;)
            child = self.child
            pplimit = self.per_parent_limit

            # The next document returned by this matcher is the parent of the
            # child's current document. We don't have to worry about whether
            # the parent is deleted, because the query that gave us the parents
            # wouldn't return deleted documents.
            self._nextdoc = self.comb.before(child.id() + 1)
            # The next parent after the child matcher's current document
            nextparent = self.comb.after(child.id()) or self.maxdoc

            # Combine the scores of all matching documents under the parent
            # using score_fn (sum by default)
            count = 1
            scores = []
            while child.is_active() and child.id() < nextparent:
                if pplimit and count > pplimit:
                    child.skip_to(nextparent)
                    break
                scores.append(child.score())
                child.next()
                count += 1

            score = self.score_fn(scores) if scores else 0
            self._nextscore = score
        def id(self):
            return self._nextdoc

        def score(self):
            return self._nextscore

        def reset(self):
            self.child.reset()
            self._gather()

        def next(self):
            if self.child.is_active():
                self._gather()
            else:
                if self._nextdoc is None:
                    raise matching.ReadTooFar
                else:
                    self._nextdoc = None
        def skip_to(self, id):
            self.child.skip_to(id)
            if self.child.is_active():
                self._gather()
            else:
                # The skip exhausted the child matcher, so deactivate this
                # matcher instead of gathering (mirrors next() above)
                self._nextdoc = None
        def value(self):
            raise NotImplementedError(self.__class__)

        def spans(self):
            return []


class NestedChildren(WrappingQuery):
"""This is the reverse of a :class:`NestedParent` query: instead of taking
a query that matches children but returns the parent, this query matches
parents but returns the children.
This is useful, for example, to search for an album title and return the
songs in the album::
schema = fields.Schema(type=fields.ID(stored=True),
album_name=fields.TEXT(stored=True),
track_num=fields.NUMERIC(stored=True),
track_name=fields.TEXT(stored=True),
lyrics=fields.TEXT)
ix = RamStorage().create_index(schema)
# Indexing
with ix.writer() as w:
# For each album, index a "group" of a parent "album" document and
# multiple child "track" documents.
with w.group():
w.add_document(type="album",
artist="The Cure", album_name="Disintegration")
w.add_document(type="track", track_num=1,
track_name="Plainsong")
w.add_document(type="track", track_num=2,
track_name="Pictures of You")
# ...
# ...
# Find songs where the song name has "heaven" in the title and the
# album the song is on has "hell" in the title
qp = QueryParser("lyrics", ix.schema)
with ix.searcher() as s:
# A query that matches all parents
all_albums = qp.parse("type:album")
# A query that matches the parents we want
albums_with_hell = qp.parse("album_name:hell")
# A query that matches the desired albums but returns the tracks
songs_on_hell_albums = NestedChildren(all_albums, albums_with_hell)
# A query that matches tracks with heaven in the title
songs_with_heaven = qp.parse("track_name:heaven")
# A query that finds tracks with heaven in the title on albums
# with hell in the title
q = query.And([songs_on_hell_albums, songs_with_heaven])
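
            # A sketch of running the query and reading the stored track
            # fields (output depends on what was actually indexed)
            for hit in s.search(q):
                print(hit["track_num"], hit["track_name"])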
"""
    def __init__(self, parents, subq, boost=1.0):
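        """
        :param parents: a query, DocIdSet object, or Results object
            representing all of the "parent" documents in the index.
        :param subq: a query matching the parent documents whose child
            documents you want returned.
        :param boost: the score assigned to each returned child document
            (this query does not score children individually).
        """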
        self.parents = parents
        self.child = subq
        self.boost = boost
    def matcher(self, searcher, context=None):
        bits = searcher._filter_to_comb(self.parents)
        if not bits:
            return matching.NullMatcher
        m = self.child.matcher(searcher, context)
        if not m.is_active():
            return matching.NullMatcher

        return self.NestedChildMatcher(
            bits,
            m,
            searcher.doc_count_all(),
            searcher.reader().is_deleted,
            boost=self.boost,
        )
    class NestedChildMatcher(matching.WrappingMatcher):
        def __init__(
            self, parent_comb, wanted_parent_matcher, limit, is_deleted, boost=1.0
        ):
            self.parent_comb = parent_comb
            self.child = wanted_parent_matcher
            self.limit = limit
            self.is_deleted = is_deleted
            self.boost = boost
            self._nextchild = -1
            self._nextparent = -1
            self._find_next_children()

        def __repr__(self):
            return f"{self.__class__.__name__}({self.parent_comb!r}, {self.child!r})"

        def reset(self):
            self.child.reset()
            self._reset()

        def _reset(self):
            self._nextchild = -1
            self._nextparent = -1
            self._find_next_children()

        def is_active(self):
            return self._nextchild < self._nextparent

        def replace(self, minquality=0):
            return self
        def _find_next_children(self):
            # "comb" contains the doc IDs of all parent documents
            comb = self.parent_comb
            # "m" is the matcher for "wanted" parents
            m = self.child
            # Last doc ID + 1
            limit = self.limit
            # A function that returns True if a doc ID is deleted
            is_deleted = self.is_deleted

            nextchild = self._nextchild
            nextparent = self._nextparent

            while m.is_active():
                # Move the "child id" to the document after the current match
                nextchild = m.id() + 1
                # Move the parent matcher to the next match
                m.next()

                # Find the next parent document (matching or not) after this
                nextparent = comb.after(nextchild)
                if nextparent is None:
                    nextparent = limit

                # Skip any deleted child documents
                while is_deleted(nextchild):
                    nextchild += 1

                # If skipping deleted documents put us to or past the next
                # parent doc, go again
                if nextchild >= nextparent:
                    continue
                else:
                    # Otherwise, we're done
                    break

            self._nextchild = nextchild
            self._nextparent = nextparent
        def id(self):
            return self._nextchild

        def all_ids(self):
            while self.is_active():
                yield self.id()
                self.next()

        def next(self):
            is_deleted = self.is_deleted
            limit = self.limit
            nextparent = self._nextparent

            # Go to the next document
            nextchild = self._nextchild
            nextchild += 1

            # Skip over any deleted child documents
            while nextchild < nextparent and is_deleted(nextchild):
                nextchild += 1
            self._nextchild = nextchild

            # If we're at or past the next parent doc, go to the next set of
            # children
            if nextchild >= limit:
                return
            elif nextchild >= nextparent:
                self._find_next_children()
        def skip_to(self, docid):
            comb = self.parent_comb
            wanted = self.child

            # self._nextchild is the "current" matching child ID
            if docid <= self._nextchild:
                return

            # self._nextparent is the next parent ID (matching or not)
            if docid < self._nextparent:
                # Just iterate
                while self.is_active() and self.id() < docid:
                    self.next()
            elif wanted.is_active():
                # Find the parent before the target ID
                pid = comb.before(docid)
                # Skip the parent matcher to that ID
                wanted.skip_to(pid)

                # If that made the matcher inactive, then we're done
                if not wanted.is_active():
                    self._nextchild = self._nextparent = self.limit
                else:
                    # Reestablish for the next child after the next matching
                    # parent
                    self._find_next_children()
            else:
                self._nextchild = self._nextparent = self.limit
        def value(self):
            raise NotImplementedError(self.__class__)
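
        # Child documents are not scored individually; every returned child
        # simply gets this query's boost as its score.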
        def score(self):
            return self.boost

        def spans(self):
            return []