Xapian 创建索引和搜索的简单范例程序，代码使用 Python 实现。
创建索引代码
import sys
import os
import errno
from contextlib import closing
import xapian as _x
def main(files):
    """Index every sonnet file in ``files`` into ``./xdb/sonnets.db``.

    Each file becomes one Xapian document whose data is the full file text.
    The document is stored under the unique term ``'Q' + path``, so
    re-indexing the same path replaces the earlier document rather than
    adding a duplicate.

    Args:
        files: iterable of paths to text files, one sonnet per file.

    Returns:
        None.

    Raises:
        OSError, IOError: if ``./xdb/`` cannot be created for a reason other
            than already existing, or if a sonnet file cannot be read.
    """
    # Try to make the database directory in the current working directory.
    # An existing directory is fine; any other failure is re-raised.
    try:
        os.mkdir('./xdb/')
    except (OSError, IOError) as e:
        if e.errno != errno.EEXIST:
            raise

    author = 'William Shakespeare'

    with closing(_x.WritableDatabase('./xdb/sonnets.db',
                                     _x.DB_CREATE_OR_OPEN)) as x_db:
        for path in files:
            # The file is only read, so open it read-only: 'r+' would
            # needlessly require write permission on the sonnet.
            with open(path, 'r') as sonnet_file:
                sonnet = sonnet_file.read()

            # NOTE(review): a trailing newline counts as one extra (empty)
            # line here; kept as-is so XLINES terms match existing indexes.
            num_lines = len(sonnet.split('\n'))

            # Make a new document: sonnet text as data, file name as id.
            x_doc = _x.Document()
            x_id = 'Q%s' % path
            x_doc.set_data(sonnet)
            x_doc.add_term(x_id)

            # Set up the term generator for this document.
            indexer = _x.TermGenerator()
            indexer.set_stemmer(_x.Stem("english"))
            indexer.set_document(x_doc)

            # Make the author searchable in the main (unprefixed) text.
            indexer.index_text(author)
            # Leave a position gap so the author terms and the sonnet text
            # are kept separate rather than running on from one another.
            indexer.increase_termpos()
            # Index the author again, separately, under the 'A' prefix.
            indexer.index_text(author, 1, 'A')
            # Index the sonnet text itself.
            indexer.index_text(sonnet)

            # Record the number of lines as an XLINES<n> term.
            x_doc.add_term('XLINES%s' % num_lines)

            # Save, keyed on the unique id term.
            x_db.replace_document(x_id, x_doc)
if __name__ == '__main__':
    # Every command-line argument is treated as a sonnet file to index;
    # whatever main() returns is handed to sys.exit() as the exit status.
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)
查询代码
import sys
from contextlib import closing
import xapian as _x
def _parseq(x_db, query, prefix=''):
    '''Parse ``query`` with a QueryParser bound to ``x_db`` and return the result.

    The parser uses the English stemmer with the STEM_SOME strategy; the
    query string is parsed with flags ``0`` and the given default ``prefix``.
    '''
    parser = _x.QueryParser()
    parser.set_stemmer(_x.Stem("english"))
    parser.set_database(x_db)
    parser.set_stemming_strategy(_x.QueryParser.STEM_SOME)
    parsed = parser.parse_query(query, 0, prefix)
    return parsed
def _joinq(op, first, sec):
if not first:
return sec
return _x.Query(op, first, sec)
def main(query, author_q, num_lines):
    """Search ``./xdb/sonnets.db`` and print the text of every matching sonnet.

    The three criteria are optional; those that are given are combined
    with AND.

    Args:
        query: free-text query against the sonnet text, or a falsy value
            to skip it.
        author_q: query matched against terms under the 'A' (author)
            prefix, or a falsy value to skip it.
        num_lines: line count as a string; matched as the exact term
            ``XLINES<n>`` after stripping whitespace. Falsy to skip.

    Returns:
        None. Each match's document data is printed, followed by a blank
        line.
    """
    x_query = None
    with closing(_x.Database('./xdb/sonnets.db')) as x_db:
        # Build the query from whichever criteria were supplied.
        if query:
            x_query = _x.Query(_parseq(x_db, query))
        if author_q:
            # The author filter must be parsed from author_q, not from the
            # free-text query; otherwise the author text is ignored and an
            # author-only search would hand None to the parser.
            x_query = _joinq(_x.Query.OP_AND, x_query,
                             _parseq(x_db, author_q, 'A'))
        if num_lines:
            x_query = _joinq(_x.Query.OP_AND, x_query,
                             _x.Query('XLINES%s' % num_lines.strip()))
        if not x_query:
            # No criteria were given: fall back to a default-constructed query.
            x_query = _x.Query()

        # Set up the enquire object to perform the query, asking for up to
        # as many results as there are documents in the database.
        enq = _x.Enquire(x_db)
        enq.set_query(x_query)
        for res in enq.get_mset(0, x_db.get_doccount(), None, None):
            # Parenthesised single-argument prints give the same output on
            # Python 2 and still parse on Python 3.
            print(res.document.get_data())
            print('')
if __name__ == '__main__':
    # Pad the argument vector with None up to four entries (script name plus
    # three arguments) so main() always receives exactly three values; the
    # last three entries are the ones passed on.
    shortfall = 4 - len(sys.argv)
    if shortfall > 0:
        sys.argv.extend([None] * shortfall)
    sys.exit(main(*sys.argv[-3:]))
