First minimal whoosh search implemented

This commit is contained in:
Dirk Alders 2024-10-09 09:57:05 +02:00
parent 77f8f61aab
commit 30b817e359
11 changed files with 195 additions and 54 deletions

2
.gitignore vendored
View File

@ -2,6 +2,7 @@
data/media data/media
data/pages data/pages
data/static data/static
data/whoosh
db.sqlite3 db.sqlite3
config.py config.py
@ -156,4 +157,3 @@ pip-selfcheck.json
# .nfs files are created when an open file is removed but is still being accessed # .nfs files are created when an open file is removed but is still being accessed
.nfs* .nfs*

View File

@ -65,9 +65,8 @@ Now there are two ways to finalise your configuration. The first way is for a te
## Backup ## Backup
### Create Backup files ### Create Backup files
source venv/bin/activate source venv/bin/activate
python manage.py dumpdata --natural-foreign --natural-primary -e contenttypes -e sessions -e auth.Permission -e sessions -e patt --indent 2 > dump_base.json python manage.py dumpdata --natural-foreign --natural-primary -e contenttypes -e sessions -e auth.Permission -e sessions -e pages --indent 2 > dump_pages.json
python manage.py dumpdata --natural-foreign --natural-primary -e contenttypes -e sessions -e auth.Permission -e sessions piki --indent 2 > dump_piki.json tar -cvzf dump_data.tgz data/media data/pages data/media
tar -cvzf dump_data.tgz data/media data/pages
### Restore Backup ### Restore Backup
@ -79,6 +78,6 @@ If you are starting without a database, you need to create one
Afterward add data step by step to the database. Afterward add data step by step to the database.
python manage.py loaddata dump_base.json python manage.py loaddata dump_pages.json
python manage.py loaddata dump_patt.json rm -rf data/pages data/media
tar -xvzf dump_data.tgz tar -xvzf dump_data.tgz

View File

@ -11,3 +11,7 @@ def url_helpview(request, page):
def url_edit(request, rel_path): def url_edit(request, rel_path):
return reverse('page-edit', kwargs={'rel_path': rel_path}) return reverse('page-edit', kwargs={'rel_path': rel_path})
def get_search_query(request):
    """Return the value of the ``q`` GET parameter, or ``None`` if it is absent."""
    query_params = request.GET
    return query_params.get('q')

View File

View File

View File

@ -0,0 +1,9 @@
from django.core.management.base import BaseCommand
from pages.search import create_index, rebuild_index
class Command(BaseCommand):
    """Management command that creates the search index and fills it with all pages."""

    def handle(self, *args, **options):
        """Create a new index, add every page to it and report the item count."""
        search_index = create_index()
        item_count = rebuild_index(search_index)
        # The template is styled first and formatted afterwards.
        styled_template = self.style.SUCCESS('Search index for %d items created.')
        self.stdout.write(styled_template % item_count)

View File

@ -1,69 +1,87 @@
from django.conf import settings from django.conf import settings
import fstools import fstools
import logging
from pages import messages, url_page from pages import messages, url_page
import mycreole import mycreole
import os import os
logger = logging.getLogger(settings.ROOT_LOGGER_NAME).getChild(__name__)
class creol_page(object):
SPLITCHAR = ":" class base_page(object):
FOLDER_ATTACHMENTS = "attachments"
FOLDER_CONTENT = 'content' FOLDER_CONTENT = 'content'
FILE_NAME = 'page' FILE_NAME = 'page'
SPLITCHAR = ":"
def __init__(self, request, rel_path) -> None: def __init__(self, path):
self._rel_path = rel_path if path.startswith(settings.PAGES_ROOT):
self._request = request self._path = path
else:
self._path = os.path.join(settings.PAGES_ROOT, path.replace("/", 2*self.SPLITCHAR))
self._raw_page_src = None
def _load_page_src(self):
    """Read the page file into ``self._raw_page_src`` on first use.

    Nothing happens when the source is already cached. A missing page file
    is cached as an empty string.
    """
    if self._raw_page_src is not None:
        return
    try:
        with open(self.filename, 'r') as page_file:
            self._raw_page_src = page_file.read()
    except FileNotFoundError:
        self._raw_page_src = ""
def update_page(self, page_txt):
    """Write ``page_txt`` to the page file and refresh the page's search entry.

    The folder of the page file is created first when it does not exist.
    """
    # Imported inside the method rather than at module level.
    from .search import update_item
    #
    target_file = self.filename
    target_folder = os.path.dirname(target_file)
    if not os.path.exists(target_folder):
        fstools.mkdir(target_folder)
    with open(target_file, 'w') as page_file:
        page_file.write(page_txt)
    update_item(self)
@property
def filename(self):
    """Path of the page file: ``<page path>/<FOLDER_CONTENT>/<FILE_NAME>``."""
    content_folder = os.path.join(self._path, self.FOLDER_CONTENT)
    return os.path.join(content_folder, self.FILE_NAME)
@property
def rel_path(self):
    """Relative page path: the page folder name with each doubled ``SPLITCHAR`` turned into ``/``."""
    folder_name = os.path.basename(self._path)
    separator = 2 * self.SPLITCHAR
    return folder_name.replace(separator, "/")
def rel_path_is_valid(self): def rel_path_is_valid(self):
return not self.SPLITCHAR in self._rel_path return not self.SPLITCHAR in self.rel_path
def is_available(self): def is_available(self):
return os.path.isfile(self.content_file_name) is_a = os.path.isfile(self.filename)
if not is_a:
logger.info("page.is_available: Not available - %s", self.filename)
return is_a
@property @property
def title(self): def title(self):
return os.path.basename(self._rel_path) return os.path.basename(self._path).split("::")[-1]
@property
def attachment_path(self):
return os.path.join(self.content_folder_name, self.FOLDER_ATTACHMENTS)
def __content_folder_filter__(self, folder):
return folder.replace('/', '::')
def __folder_content_filter__(self, folder):
return folder.replace('::', '/')
@property
def content_folder_name(self):
return self.__content_folder_filter__(self._rel_path)
@property
def content_file_name(self):
return os.path.join(settings.PAGES_ROOT, self.content_folder_name, self.FOLDER_CONTENT, self.FILE_NAME)
@property @property
def raw_page_src(self): def raw_page_src(self):
try: self._load_page_src()
with open(self.content_file_name, 'r') as fh: return self._raw_page_src
return fh.read()
except FileNotFoundError:
return ""
def update_page(self, page_txt):
folder = os.path.dirname(self.content_file_name) class creole_page(base_page):
if not os.path.exists(folder): FOLDER_ATTACHMENTS = "attachments"
fstools.mkdir(folder)
with open(self.content_file_name, 'w') as fh: def __init__(self, request, path) -> None:
fh.write(page_txt) self._request = request
super().__init__(path)
@property
def attachment_path(self):
return os.path.join(os.path.basename(self._path), self.FOLDER_ATTACHMENTS)
def render_to_html(self): def render_to_html(self):
if self.is_available(): if self.is_available():
return self.render_text(self._request, self.raw_page_src) return self.render_text(self._request, self.raw_page_src)
else: else:
messages.unavailable_msg_page(self._request, self._rel_path) messages.unavailable_msg_page(self._request, self.rel_path)
return "" return ""
def render_text(self, request, txt): def render_text(self, request, txt):
@ -102,18 +120,18 @@ class creol_page(object):
# #
rv = "" rv = ""
# create a rel_path list # create a rel_path list
pathlist = [self.__folder_content_filter__(os.path.basename(path)) for path in fstools.dirlist(settings.PAGES_ROOT, rekursive=False)] pathlist = [base_page(path).rel_path for path in fstools.dirlist(settings.PAGES_ROOT, rekursive=False)]
# sort basename # sort basename
pathlist.sort(key=os.path.basename) pathlist.sort(key=os.path.basename)
last_char = None last_char = None
for contentname in pathlist: for contentname in pathlist:
# #
if (contentname.startswith(self._rel_path) or allpages) and contentname != self._rel_path: if (contentname.startswith(self.rel_path) or allpages) and contentname != self.rel_path:
if allpages: if allpages:
name = contentname name = contentname
else: else:
name = contentname[len(self._rel_path)+1:] name = contentname[len(self.rel_path)+1:]
if name.count('/') < depth and name.startswith(startname): if name.count('/') < depth and name.startswith(startname):
if last_char != os.path.basename(name)[0].upper(): if last_char != os.path.basename(name)[0].upper():
last_char = os.path.basename(name)[0].upper() last_char = os.path.basename(name)[0].upper()

96
pages/search.py Normal file
View File

@ -0,0 +1,96 @@
from django.conf import settings
import fstools
import logging
import os
from whoosh.fields import Schema, ID, TEXT
from whoosh.qparser.dateparse import DateParserPlugin
from whoosh import index, qparser
from pages.page import base_page
logger = logging.getLogger(settings.ROOT_LOGGER_NAME).getChild(__name__)
# Layout of a search index entry. Only ``id`` is declared unique and stored;
# ``title`` and ``page_src`` are plain text fields.
SCHEMA = Schema(
    id=ID(unique=True, stored=True),
    # Page
    title=TEXT,
    page_src=TEXT,
)
def mk_whooshpath_if_needed():
    """Create ``settings.WHOOSH_PATH`` via ``fstools.mkdir`` unless the path already exists."""
    whoosh_path = settings.WHOOSH_PATH
    if os.path.exists(whoosh_path):
        return
    fstools.mkdir(whoosh_path)
def create_index():
    """Create a new index with ``SCHEMA`` in ``settings.WHOOSH_PATH`` and return it."""
    mk_whooshpath_if_needed()
    # The debug message is emitted before the index is actually created.
    logger.debug('Search Index created.')
    new_index = index.create_in(settings.WHOOSH_PATH, schema=SCHEMA)
    return new_index
def rebuild_index(ix):
    """Add every entry found directly below ``settings.PAGES_ROOT`` to ``ix``.

    Returns the number of entries that were passed to ``add_item``.
    """
    page_folders = fstools.dirlist(settings.PAGES_ROOT, rekursive=False)
    for page_folder in page_folders:
        add_item(ix, base_page(page_folder))
    return len(page_folders)
def load_index():
    """Open the index in ``settings.WHOOSH_PATH``; create a new one if it is empty."""
    mk_whooshpath_if_needed()
    try:
        opened_index = index.open_dir(settings.WHOOSH_PATH)
    except index.EmptyIndexError:
        # No index in the folder yet.
        return create_index()
    logger.debug('Search Index opened.')
    return opened_index
def add_item(ix, bp: base_page):
    """Add the page ``bp`` as a new document to the index ``ix``.

    The document consists of the page's ``rel_path`` (as ``id``), its title
    and its raw page source.
    """
    # Define Standard data
    #
    data = {
        'id': bp.rel_path,
        'title': bp.title,
        'page_src': bp.raw_page_src,
    }
    with ix.writer() as writer:
        logger.info('Adding document with id=%s to the search index.', data.get('id'))
        writer.add_document(**data)
        for field_name, field_value in data.items():
            logger.debug(' - Adding %s=%s', field_name, repr(field_value))
def whoosh_search(search_txt):
    """Search ``title`` and ``page_src`` for ``search_txt``.

    Returns the list of ``id`` values of all hits, or ``None`` when parsing
    the query raises an exception.
    """
    ix = load_index()
    parser = qparser.MultifieldParser(['title', 'page_src'], ix.schema)
    parser.add_plugin(DateParserPlugin(free=True))
    try:
        parsed_query = parser.parse(search_txt)
    except Exception:
        # Any parse failure (AttributeError included) is reported as None.
        return None
    with ix.searcher() as searcher:
        # Hits are read while the searcher is still open.
        return [hit['id'] for hit in searcher.search(parsed_query, limit=None)]
def delete_item(ix, bp: base_page):
    """Remove the document of page ``bp`` from the index ``ix``.

    :param ix: The search index to modify.
    :param bp: The page whose ``rel_path`` identifies the document to remove.
    """
    with ix.writer() as w:
        logger.info('Removing document with id=%s from the search index.', bp.rel_path)
        # The key field of SCHEMA is "id" (filled with rel_path by add_item).
        # The previous field name "task_id" is not part of the schema, so the
        # old document was never removed.
        w.delete_by_term("id", bp.rel_path)
def update_item(bp: base_page):
    """Replace the search entry of page ``bp``: delete it first, then add it again."""
    search_index = load_index()
    delete_item(search_index, bp)
    add_item(search_index, bp)

View File

@ -8,12 +8,14 @@ import logging
from . import access from . import access
from . import messages from . import messages
from . import url_page from . import url_page
from . import get_search_query
import config import config
from .context import context_adaption from .context import context_adaption
from .forms import EditForm from .forms import EditForm
from .help import help_pages from .help import help_pages
import mycreole import mycreole
from .page import creol_page from .page import creole_page
from .search import whoosh_search
from themes import Context from themes import Context
logger = logging.getLogger(settings.ROOT_LOGGER_NAME).getChild(__name__) logger = logging.getLogger(settings.ROOT_LOGGER_NAME).getChild(__name__)
@ -26,7 +28,7 @@ def root(request):
def page(request, rel_path): def page(request, rel_path):
context = Context(request) # needs to be executed first because of time mesurement context = Context(request) # needs to be executed first because of time mesurement
# #
p = creol_page(request, rel_path) p = creole_page(request, rel_path)
if access.read_page(request, rel_path): if access.read_page(request, rel_path):
page_content = p.render_to_html() page_content = p.render_to_html()
else: else:
@ -48,7 +50,7 @@ def edit(request, rel_path):
if access.write_page(request, rel_path): if access.write_page(request, rel_path):
context = Context(request) # needs to be executed first because of time mesurement context = Context(request) # needs to be executed first because of time mesurement
# #
p = creol_page(request, rel_path) p = creole_page(request, rel_path)
# #
if not request.POST: if not request.POST:
form = EditForm(page_data=p.raw_page_src) form = EditForm(page_data=p.raw_page_src)
@ -92,10 +94,22 @@ def edit(request, rel_path):
def search(request): def search(request):
context = Context(request) # needs to be executed first because of time mesurement context = Context(request) # needs to be executed first because of time mesurement
#
search_txt = get_search_query(request)
sr = whoosh_search(search_txt)
if sr is None:
messages.error(request, _('Invalid search pattern: %s') % repr(search_txt))
sr = []
page_content = "= Searchresults\n"
for rel_path in sr:
p = creole_page(request, rel_path)
page_content += f"[[/page/{rel_path}|{p.title}]]\n"
#
context_adaption( context_adaption(
context, context,
request, request,
page_content="Search is not yet implemented..." page_content=mycreole.render_simple(page_content)
) )
return render(request, 'pages/page.html', context=context) return render(request, 'pages/page.html', context=context)

View File

@ -135,6 +135,8 @@ MYCREOLE_BAR = {
PAGES_ROOT = os.path.join(BASE_DIR, 'data', 'pages') PAGES_ROOT = os.path.join(BASE_DIR, 'data', 'pages')
WHOOSH_PATH = os.path.join(BASE_DIR, 'data', 'whoosh')
# Default primary key field type # Default primary key field type
# https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field # https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field
@ -215,4 +217,3 @@ File "%(pathname)s", line %(lineno)d, in %(funcName)s
}, },
}, },
} }

View File

@ -2,4 +2,4 @@ Django
Pillow Pillow
python-creole python-creole
pytz pytz
Whoosh