Xapian 创建索引和搜索的简单范例程序,代码使用 Python 实现
创建索引代码
import sys
import os
import errno
from contextlib import closing

import xapian as _x


def main(files):
    """Index every file in *files* as one document in ./xdb/sonnets.db.

    Each document stores the full file text as its data and carries:

    * a unique ``Q<path>`` term, used as the document id so that
      re-indexing the same path replaces the earlier document;
    * the author name, indexed both unprefixed and under the ``A`` prefix;
    * the stemmed sonnet text;
    * an ``XLINES<n>`` boolean term holding the number of lines.

    Args:
        files: iterable of paths to plain-text files to index.

    Raises:
        OSError / IOError: if ./xdb/ cannot be created (for any reason
            other than already existing) or an input file cannot be read.
    """
    # Try to make a db directory in the current working directory; an
    # already-existing directory is not an error.
    try:
        os.mkdir('./xdb/')
    except (OSError, IOError) as e:
        if e.errno != errno.EEXIST:
            raise

    author = 'William Shakespeare'

    with closing(_x.WritableDatabase('./xdb/sonnets.db',
                                     _x.DB_CREATE_OR_OPEN)) as x_db:
        for path in files:
            # Read-only access is enough; 'r+' would needlessly require
            # write permission on the input file.
            with open(path, 'r') as fh:
                sonnet = fh.read()

            # splitlines() does not count a phantom empty line after a
            # trailing newline, unlike split('\n').
            num_lines = len(sonnet.splitlines())

            # Make a new document: sonnet text as data, file path as id.
            x_doc = _x.Document()
            x_id = 'Q%s' % path
            x_doc.set_data(sonnet)
            x_doc.add_term(x_id)

            # Set up the indexer for this document.
            indexer = _x.TermGenerator()
            indexer.set_stemmer(_x.Stem("english"))
            indexer.set_document(x_doc)

            # Make the author searchable in the main (unprefixed) text.
            indexer.index_text(author)
            # Leave a position gap so the author does not run on into the
            # text that follows; keep them separate.
            indexer.increase_termpos()
            # Index the author under the 'A' prefix, separately.
            indexer.index_text(author, 1, 'A')
            # Index the sonnet text itself.
            indexer.index_text(sonnet)

            # Add XLINES<n> as the number of lines.
            x_doc.add_term('XLINES%s' % num_lines)

            # Save, replacing any document already stored under this id.
            x_db.replace_document(x_id, x_doc)


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
查询代码
import sys
from contextlib import closing

import xapian as _x


def _parseq(x_db, query, prefix=''):
    """Parse *query* with a QueryParser and return the resulting Query.

    Args:
        x_db: the open database, handed to the parser via set_database().
        query: the user's query string.
        prefix: default term prefix applied to the parsed terms
            ('' for the main text, 'A' for the author field).
    """
    qp = _x.QueryParser()
    stemmer = _x.Stem("english")
    qp.set_stemmer(stemmer)
    qp.set_database(x_db)
    qp.set_stemming_strategy(_x.QueryParser.STEM_SOME)
    # Second argument is the parser flags value; 0 is passed here.
    return qp.parse_query(query, 0, prefix)


def _joinq(op, first, sec):
    """Combine *first* and *sec* with *op*; return *sec* alone if *first* is unset."""
    if not first:
        return sec
    return _x.Query(op, first, sec)


def main(query, author_q, num_lines):
    """Search ./xdb/sonnets.db and print the data of every matching document.

    All supplied criteria are AND-ed together; a criterion that is None
    or empty is skipped.

    Args:
        query: free-text query against the main text, or None.
        author_q: free-text query against the 'A' (author) prefix, or None.
        num_lines: line count to match against the XLINES<n> term, as a
            string, or None.
    """
    x_query = None
    with closing(_x.Database('./xdb/sonnets.db')) as x_db:
        # Build up the query from whichever criteria were supplied.
        if query:
            x_query = _x.Query(_parseq(x_db, query))
        if author_q:
            # Parse the author text itself (not the main query) under
            # the 'A' prefix.
            x_query = _joinq(_x.Query.OP_AND, x_query,
                             _parseq(x_db, author_q, 'A'))
        if num_lines:
            x_query = _joinq(_x.Query.OP_AND, x_query,
                             _x.Query('XLINES%s' % num_lines.strip()))
        if not x_query:
            # No criteria given: fall back to a default-constructed query.
            # NOTE(review): per the Xapian docs this matches no documents;
            # confirm that is the intended behaviour.
            x_query = _x.Query()

        # Set up the enquire object to perform the query, asking for up
        # to as many results as there are documents.
        enq = _x.Enquire(x_db)
        enq.set_query(x_query)
        for res in enq.get_mset(0, x_db.get_doccount(), None, None):
            data = res.document.get_data()
            # On Python 3 the bindings return bytes; decode for display.
            # On Python 2 bytes is str, so this branch is skipped.
            if isinstance(data, bytes) and not isinstance(data, str):
                data = data.decode('utf-8')
            print(data)
            # Blank line between results; print('') works on Python 2 and 3.
            print('')


if __name__ == '__main__':
    # Pad missing command-line arguments with None so main() always
    # receives exactly three values.
    while len(sys.argv) < 4:
        sys.argv.append(None)
    sys.exit(main(*sys.argv[-3:]))