Source code for whoosh.lang.wordnet

# Copyright 2009 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

"""This module contains low-level functions and a high-level class for parsing
the prolog file "wn_s.pl" from the WordNet prolog download
into an object suitable for looking up synonyms and performing query expansion.

http://wordnetcode.princeton.edu/3.0/WNprolog-3.0.tar.gz
"""

from collections import defaultdict

from whoosh.fields import ID, STORED, Schema


def parse_file(f):
    """Parses the WordNet wn_s.pl prolog file and returns two dictionaries:
    word2nums and num2words.

    Only lines starting with ``s(`` are considered. Each one is expected to
    look like ``s(100001740,1,'entity',n,1,11).``: the text before the first
    comma is the synset number and the first single-quoted atom is the word.
    Words are lower-cased, and any word that is not purely alphabetic
    (multi-word phrases, hyphenated or apostrophised words, digits) is
    skipped.

    :param f: an iterable of lines (for example an open file). Lines may be
        ``str`` or ``bytes``; bytes are decoded as UTF-8 with undecodable
        bytes replaced.
    :returns: a ``(word2nums, num2words)`` tuple of ``defaultdict(list)``
        objects mapping each word to its synset numbers and each synset
        number to its words.
    :raises ValueError: if the synset number of an ``s(`` line is not an
        integer.
    """

    word2nums = defaultdict(list)
    num2words = defaultdict(list)

    for line in f:
        # Files opened in binary mode yield bytes; normalise to text so the
        # string operations below work.
        if isinstance(line, bytes):
            line = line.decode("utf-8", "replace")
        if not line.startswith("s("):
            continue

        fields = line[2:]
        num = int(fields[: fields.find(",")])

        # Locate the quoted atom. ``find`` returns -1 when there is no quote,
        # which makes ``start`` 0 (falsy).
        start = fields.find("'") + 1
        if not start:
            continue
        end = fields.find("'", start)
        # Prolog escapes an apostrophe inside an atom by doubling it
        # ('o''clock'), so a quote immediately followed by another quote is
        # not the terminator: skip the pair and keep searching.
        while end != -1 and fields.startswith("'", end + 1):
            end = fields.find("'", end + 2)
        if end == -1:
            # Unterminated atom: nothing reliable to extract.
            continue

        word = fields[start:end].replace("''", "'").lower()
        if not word.isalpha():
            continue

        word2nums[word].append(num)
        num2words[num].append(word)

    return word2nums, num2words
def make_index(storage, indexname, word2nums, num2words):
    """Creates a Whoosh index in the given storage object containing
    synonyms taken from word2nums and num2words. Returns the Index object.

    One document is added per key of ``word2nums``: the word itself goes in
    the ``word`` ID field and its synonym list (as computed by
    :func:`synonyms`) goes in the ``syns`` stored field.

    :param storage: the storage object on which ``create_index`` is called.
    :param indexname: the name passed to ``create_index``.
    :param word2nums: mapping of word to synset numbers.
    :param num2words: mapping of synset number to words.
    :returns: the newly created index.
    """

    schema = Schema(word=ID, syns=STORED)
    ix = storage.create_index(schema, indexname=indexname)

    # Using the writer as a context manager commits when the block finishes
    # normally and cancels the writer if adding a document raises, so a
    # failure part-way through does not leave the writer open.
    with ix.writer() as w:
        for word in word2nums:
            syns = synonyms(word2nums, num2words, word)
            w.add_document(word=str(word), syns=syns)

    return ix
def synonyms(word2nums, num2words, word):
    """Uses the word2nums and num2words dicts to look up synonyms for the
    given word. Returns a list of synonym strings.

    The lookup is read-only: an unknown word yields an empty list and never
    inserts a new entry into either mapping, even when they are
    ``defaultdict`` objects.

    :param word2nums: mapping of word to the synset numbers it belongs to.
    :param num2words: mapping of synset number to the words in that synset.
    :param word: the word to look up (matched exactly as given).
    :returns: a sorted list of every word sharing a synset with ``word``,
        excluding ``word`` itself.
    """

    syns = set()
    # .get() rather than indexing: indexing a defaultdict would create an
    # empty entry for every unknown word that is looked up.
    for key in word2nums.get(word, ()):
        syns.update(num2words.get(key, ()))

    syns.discard(word)
    return sorted(syns)
class Thesaurus:
    """Represents the WordNet synonym database, either loaded into memory
    from the wn_s.pl Prolog file, or stored on disk in a Whoosh index.

    This class allows you to parse the prolog file "wn_s.pl" from the WordNet
    prolog download into an object suitable for looking up synonyms and
    performing query expansion.

    http://wordnetcode.princeton.edu/3.0/WNprolog-3.0.tar.gz

    To load a Thesaurus object from the wn_s.pl file...

    >>> t = Thesaurus.from_filename("wn_s.pl")

    To save the in-memory Thesaurus to a Whoosh index...

    >>> from whoosh.filedb.filestore import FileStorage
    >>> fs = FileStorage("index")
    >>> t.to_storage(fs)

    To load a Thesaurus object from a Whoosh index...

    >>> t = Thesaurus.from_storage(fs)

    The Thesaurus object is thus usable in two ways:

    * Parse the wn_s.pl file into memory (Thesaurus.from_*) and then look up
      synonyms in memory. This has a startup cost for parsing the file, and
      uses quite a bit of memory to store two large dictionaries, however
      synonym look-ups are very fast.

    * Parse the wn_s.pl file into memory (Thesaurus.from_filename) then save
      it to an index (to_storage). From then on, open the thesaurus from the
      saved index (Thesaurus.from_storage). This has a large cost for storing
      the index, but after that it is faster to open the Thesaurus (than
      re-parsing the file) but slightly slower to look up synonyms.

    Here are timings for various tasks on my (fast) Windows machine, which
    might give an idea of relative costs for in-memory vs. on-disk.

    ================================================ ================
    Task                                             Approx. time (s)
    ================================================ ================
    Parsing the wn_s.pl file                         1.045
    Saving to an on-disk index                       13.084
    Loading from an on-disk index                    0.082
    Look up synonyms for "light" (in memory)         0.0011
    Look up synonyms for "light" (loaded from disk)  0.0028
    ================================================ ================

    Basically, if you can afford spending the memory necessary to parse the
    Thesaurus and then cache it, it's faster. Otherwise, use an on-disk index.
    """

    def __init__(self):
        # In-memory dictionaries (word -> synset numbers, synset number ->
        # words); set by from_file()/from_filename().
        self.w2n = None
        self.n2w = None
        # Index searcher; set by from_storage().
        self.searcher = None

    @classmethod
    def from_file(cls, fileobj):
        """Creates a Thesaurus object from the given file-like object, which
        should contain the WordNet wn_s.pl file.

        >>> f = open("wn_s.pl")
        >>> t = Thesaurus.from_file(f)
        >>> t.synonyms("hail")
        ['acclaim', 'come', 'herald']

        :param fileobj: an iterable of lines, passed to :func:`parse_file`.
        """

        thes = cls()
        thes.w2n, thes.n2w = parse_file(fileobj)
        return thes

    @classmethod
    def from_filename(cls, filename):
        """Creates a Thesaurus object from the given filename, which should
        contain the WordNet wn_s.pl file.

        >>> t = Thesaurus.from_filename("wn_s.pl")
        >>> t.synonyms("hail")
        ['acclaim', 'come', 'herald']

        :param filename: path of the wn_s.pl file to read.
        """

        # Open in text mode: the parser works on str lines, and a binary-mode
        # file would hand it bytes. Undecodable bytes are replaced rather
        # than aborting the whole load.
        with open(filename, encoding="utf-8", errors="replace") as f:
            return cls.from_file(f)

    @classmethod
    def from_storage(cls, storage, indexname="THES"):
        """Creates a Thesaurus object from the given storage object,
        which should contain an index created by Thesaurus.to_storage().

        >>> from whoosh.filedb.filestore import FileStorage
        >>> fs = FileStorage("index")
        >>> t = Thesaurus.from_storage(fs)
        >>> t.synonyms("hail")
        ['acclaim', 'come', 'herald']

        :param storage: A :class:`whoosh.store.Storage` object from
            which to load the index.
        :param indexname: A name for the index. This allows you to
            store multiple indexes in the same storage object.
        """

        thes = cls()
        index = storage.open_index(indexname=indexname)
        thes.searcher = index.searcher()
        return thes

    def to_storage(self, storage, indexname="THES"):
        """Creates an index in the given storage object from the
        synonyms loaded from a WordNet file.

        >>> from whoosh.filedb.filestore import FileStorage
        >>> fs = FileStorage("index")
        >>> t = Thesaurus.from_filename("wn_s.pl")
        >>> t.to_storage(fs)

        :param storage: A :class:`whoosh.store.Storage` object in
            which to save the index.
        :param indexname: A name for the index. This allows you to
            store multiple indexes in the same storage object.
        :raises Exception: if no synonyms have been loaded into memory.
        """

        if not self.w2n or not self.n2w:
            raise Exception("No synonyms loaded")
        make_index(storage, indexname, self.w2n, self.n2w)

    def synonyms(self, word):
        """Returns a list of synonyms for the given word.

        >>> thesaurus.synonyms("hail")
        ['acclaim', 'come', 'herald']

        The word is lower-cased before the lookup. When the thesaurus was
        opened from an index, a word with no matching document yields an
        empty list.
        """

        word = word.lower()
        if self.searcher is not None:
            doc = self.searcher.document(word=word)
            # document() gives None when nothing matches; report "no
            # synonyms" instead of failing on None["syns"].
            return doc["syns"] if doc is not None else []
        return synonyms(self.w2n, self.n2w, word)