Browse Source

First minimal whoosh search implemented

master
Dirk Alders 2 months ago
parent
commit
30b817e359

+ 1
- 1
.gitignore View File

2
 data/media
2
 data/media
3
 data/pages
3
 data/pages
4
 data/static
4
 data/static
5
+data/whoosh
5
 db.sqlite3
6
 db.sqlite3
6
 config.py
7
 config.py
7
 
8
 
156
 
157
 
157
 # .nfs files are created when an open file is removed but is still being accessed
158
 # .nfs files are created when an open file is removed but is still being accessed
158
 .nfs*
159
 .nfs*
159
-

+ 4
- 5
README.md View File

65
 ## Backup
65
 ## Backup
66
 ### Create Backup files
66
 ### Create Backup files
67
     source venv/bin/activate
67
     source venv/bin/activate
68
-    python manage.py dumpdata --natural-foreign --natural-primary -e contenttypes -e sessions -e auth.Permission -e sessions -e patt --indent 2 > dump_base.json
69
-    python manage.py dumpdata --natural-foreign --natural-primary -e contenttypes -e sessions -e auth.Permission -e sessions piki --indent 2 > dump_piki.json
70
-    tar -cvzf dump_data.tgz data/media data/pages
68
+    python manage.py dumpdata --natural-foreign --natural-primary -e contenttypes -e sessions -e auth.Permission -e sessions -e pages --indent 2 > dump_pages.json
69
+    tar -cvzf dump_data.tgz data/media data/pages
71
 
70
 
72
 ### Restore Backup
71
 ### Restore Backup
73
 
72
 
79
 
78
 
80
 Afterward add data step by step to the database.
79
 Afterward add data step by step to the database.
81
 
80
 
82
-    python manage.py loaddata dump_base.json
83
-    python manage.py loaddata dump_patt.json
81
+    python manage.py loaddata dump_pages.json
82
+    rm -rf data/pages data/media
84
     tar -xvzf dump_data.tgz
83
     tar -xvzf dump_data.tgz

+ 4
- 0
pages/__init__.py View File

11
 
11
 
12
 def url_edit(request, rel_path):
12
 def url_edit(request, rel_path):
13
     return reverse('page-edit', kwargs={'rel_path': rel_path})
13
     return reverse('page-edit', kwargs={'rel_path': rel_path})
14
+
15
+
16
+def get_search_query(request):
17
+    return request.GET.get('q')

+ 0
- 0
pages/management/__init__.py View File


+ 0
- 0
pages/management/commands/__init__.py View File


+ 9
- 0
pages/management/commands/rebuild_index.py View File

1
+from django.core.management.base import BaseCommand
2
+from pages.search import create_index, rebuild_index
3
+
4
+
5
+class Command(BaseCommand):
6
+    def handle(self, *args, **options):
7
+        ix = create_index()
8
+        n = rebuild_index(ix)
9
+        self.stdout.write(self.style.SUCCESS('Search index for %d items created.') % n)

+ 57
- 39
pages/page.py View File

1
 from django.conf import settings
1
 from django.conf import settings
2
 
2
 
3
 import fstools
3
 import fstools
4
+import logging
4
 from pages import messages, url_page
5
 from pages import messages, url_page
5
 import mycreole
6
 import mycreole
6
 import os
7
 import os
7
 
8
 
9
+logger = logging.getLogger(settings.ROOT_LOGGER_NAME).getChild(__name__)
8
 
10
 
9
-class creol_page(object):
10
-    SPLITCHAR = ":"
11
-    FOLDER_ATTACHMENTS = "attachments"
11
+
12
+class base_page(object):
12
     FOLDER_CONTENT = 'content'
13
     FOLDER_CONTENT = 'content'
13
     FILE_NAME = 'page'
14
     FILE_NAME = 'page'
15
+    SPLITCHAR = ":"
14
 
16
 
15
-    def __init__(self, request, rel_path) -> None:
16
-        self._rel_path = rel_path
17
-        self._request = request
17
+    def __init__(self, path):
18
+        if path.startswith(settings.PAGES_ROOT):
19
+            self._path = path
20
+        else:
21
+            self._path = os.path.join(settings.PAGES_ROOT, path.replace("/", 2*self.SPLITCHAR))
22
+        self._raw_page_src = None
18
 
23
 
19
-    def rel_path_is_valid(self):
20
-        return not self.SPLITCHAR in self._rel_path
24
+    def _load_page_src(self):
25
+        if self._raw_page_src is None:
26
+            try:
27
+                with open(self.filename, 'r') as fh:
28
+                    self._raw_page_src = fh.read()
29
+            except FileNotFoundError:
30
+                self._raw_page_src = ""
21
 
31
 
22
-    def is_available(self):
23
-        return os.path.isfile(self.content_file_name)
32
+    def update_page(self, page_txt):
33
+        from .search import update_item
34
+        #
35
+        folder = os.path.dirname(self.filename)
36
+        if not os.path.exists(folder):
37
+            fstools.mkdir(folder)
38
+        with open(self.filename, 'w') as fh:
39
+            fh.write(page_txt)
40
+        update_item(self)
24
 
41
 
25
     @property
42
     @property
26
-    def title(self):
27
-        return os.path.basename(self._rel_path)
43
+    def filename(self):
44
+        return os.path.join(self._path, self.FOLDER_CONTENT, self.FILE_NAME)
28
 
45
 
29
     @property
46
     @property
30
-    def attachment_path(self):
31
-        return os.path.join(self.content_folder_name, self.FOLDER_ATTACHMENTS)
32
-
33
-    def __content_folder_filter__(self, folder):
34
-        return folder.replace('/', '::')
47
+    def rel_path(self):
48
+        return os.path.basename(self._path).replace(2*self.SPLITCHAR, "/")
35
 
49
 
36
-    def __folder_content_filter__(self, folder):
37
-        return folder.replace('::', '/')
50
+    def rel_path_is_valid(self):
51
+        return not self.SPLITCHAR in self.rel_path
38
 
52
 
39
-    @property
40
-    def content_folder_name(self):
41
-        return self.__content_folder_filter__(self._rel_path)
53
+    def is_available(self):
54
+        is_a = os.path.isfile(self.filename)
55
+        if not is_a:
56
+            logger.info("page.is_available: Not available - %s", self.filename)
57
+        return is_a
42
 
58
 
43
     @property
59
     @property
44
-    def content_file_name(self):
45
-        return os.path.join(settings.PAGES_ROOT, self.content_folder_name, self.FOLDER_CONTENT, self.FILE_NAME)
60
+    def title(self):
61
+        return os.path.basename(self._path).split("::")[-1]
46
 
62
 
47
     @property
63
     @property
48
     def raw_page_src(self):
64
     def raw_page_src(self):
49
-        try:
50
-            with open(self.content_file_name, 'r') as fh:
51
-                return fh.read()
52
-        except FileNotFoundError:
53
-            return ""
65
+        self._load_page_src()
66
+        return self._raw_page_src
54
 
67
 
55
-    def update_page(self, page_txt):
56
-        folder = os.path.dirname(self.content_file_name)
57
-        if not os.path.exists(folder):
58
-            fstools.mkdir(folder)
59
-        with open(self.content_file_name, 'w') as fh:
60
-            fh.write(page_txt)
68
+
69
+class creole_page(base_page):
70
+    FOLDER_ATTACHMENTS = "attachments"
71
+
72
+    def __init__(self, request, path) -> None:
73
+        self._request = request
74
+        super().__init__(path)
75
+
76
+    @property
77
+    def attachment_path(self):
78
+        return os.path.join(os.path.basename(self._path), self.FOLDER_ATTACHMENTS)
61
 
79
 
62
     def render_to_html(self):
80
     def render_to_html(self):
63
         if self.is_available():
81
         if self.is_available():
64
             return self.render_text(self._request, self.raw_page_src)
82
             return self.render_text(self._request, self.raw_page_src)
65
         else:
83
         else:
66
-            messages.unavailable_msg_page(self._request, self._rel_path)
84
+            messages.unavailable_msg_page(self._request, self.rel_path)
67
             return ""
85
             return ""
68
 
86
 
69
     def render_text(self, request, txt):
87
     def render_text(self, request, txt):
102
         #
120
         #
103
         rv = ""
121
         rv = ""
104
         # create a rel_path list
122
         # create a rel_path list
105
-        pathlist = [self.__folder_content_filter__(os.path.basename(path)) for path in fstools.dirlist(settings.PAGES_ROOT, rekursive=False)]
123
+        pathlist = [base_page(path).rel_path for path in fstools.dirlist(settings.PAGES_ROOT, rekursive=False)]
106
         # sort basename
124
         # sort basename
107
         pathlist.sort(key=os.path.basename)
125
         pathlist.sort(key=os.path.basename)
108
 
126
 
109
         last_char = None
127
         last_char = None
110
         for contentname in pathlist:
128
         for contentname in pathlist:
111
             #
129
             #
112
-            if (contentname.startswith(self._rel_path) or allpages) and contentname != self._rel_path:
130
+            if (contentname.startswith(self.rel_path) or allpages) and contentname != self.rel_path:
113
                 if allpages:
131
                 if allpages:
114
                     name = contentname
132
                     name = contentname
115
                 else:
133
                 else:
116
-                    name = contentname[len(self._rel_path)+1:]
134
+                    name = contentname[len(self.rel_path)+1:]
117
                 if name.count('/') < depth and name.startswith(startname):
135
                 if name.count('/') < depth and name.startswith(startname):
118
                     if last_char != os.path.basename(name)[0].upper():
136
                     if last_char != os.path.basename(name)[0].upper():
119
                         last_char = os.path.basename(name)[0].upper()
137
                         last_char = os.path.basename(name)[0].upper()

+ 96
- 0
pages/search.py View File

1
+from django.conf import settings
2
+
3
+import fstools
4
+import logging
5
+import os
6
+from whoosh.fields import Schema, ID, TEXT
7
+from whoosh.qparser.dateparse import DateParserPlugin
8
+from whoosh import index, qparser
9
+
10
+from pages.page import base_page
11
+
12
+logger = logging.getLogger(settings.ROOT_LOGGER_NAME).getChild(__name__)
13
+
14
+
15
+SCHEMA = Schema(
16
+    id=ID(unique=True, stored=True),
17
+    # Page
18
+    title=TEXT,
19
+    page_src=TEXT
20
+)
21
+
22
+
23
+def mk_whooshpath_if_needed():
24
+    if not os.path.exists(settings.WHOOSH_PATH):
25
+        fstools.mkdir(settings.WHOOSH_PATH)
26
+
27
+
28
+def create_index():
29
+    mk_whooshpath_if_needed()
30
+    logger.debug('Search Index created.')
31
+    return index.create_in(settings.WHOOSH_PATH, schema=SCHEMA)
32
+
33
+
34
+def rebuild_index(ix):
35
+    page_path = fstools.dirlist(settings.PAGES_ROOT, rekursive=False)
36
+    for path in page_path:
37
+        bp = base_page(path)
38
+        add_item(ix, bp)
39
+    return len(page_path)
40
+
41
+
42
+def load_index():
43
+    mk_whooshpath_if_needed()
44
+    try:
45
+        ix = index.open_dir(settings.WHOOSH_PATH)
46
+    except index.EmptyIndexError:
47
+        ix = create_index()
48
+    else:
49
+        logger.debug('Search Index opened.')
50
+    return ix
51
+
52
+
53
+def add_item(ix, bp: base_page):
54
+    # Define Standard data
55
+    #
56
+    data = dict(
57
+        id=bp.rel_path,
58
+        title=bp.title,
59
+        page_src=bp.raw_page_src
60
+    )
61
+    with ix.writer() as w:
62
+        logger.info('Adding document with id=%s to the search index.', data.get('id'))
63
+        w.add_document(**data)
64
+        for key in data:
65
+            logger.debug('  - Adding %s=%s', key, repr(data[key]))
66
+
67
+
68
+def whoosh_search(search_txt):
69
+    ix = load_index()
70
+    qp = qparser.MultifieldParser(['title', 'page_src'], ix.schema)
71
+    qp.add_plugin(DateParserPlugin(free=True))
72
+    try:
73
+        q = qp.parse(search_txt)
74
+    except AttributeError:
75
+        return None
76
+    except Exception:
77
+        return None
78
+    with ix.searcher() as s:
79
+        results = s.search(q, limit=None)
80
+        rpl = []
81
+        for hit in results:
82
+            rpl.append(hit['id'])
83
+        return rpl
84
+
85
+
86
+def delete_item(ix, bp: base_page):
87
+    with ix.writer() as w:
88
+        logger.info('Removing document with id=%s from the search index.', bp.rel_path)
89
+        w.delete_by_term("task_id", bp.rel_path)
90
+
91
+
92
+def update_item(bp: base_page):
93
+    ix = load_index()
94
+    delete_item(ix, bp)
95
+    add_item(ix, bp)
96
+

+ 18
- 4
pages/views.py View File

8
 from . import access
8
 from . import access
9
 from . import messages
9
 from . import messages
10
 from . import url_page
10
 from . import url_page
11
+from . import get_search_query
11
 import config
12
 import config
12
 from .context import context_adaption
13
 from .context import context_adaption
13
 from .forms import EditForm
14
 from .forms import EditForm
14
 from .help import help_pages
15
 from .help import help_pages
15
 import mycreole
16
 import mycreole
16
-from .page import creol_page
17
+from .page import creole_page
18
+from .search import whoosh_search
17
 from themes import Context
19
 from themes import Context
18
 
20
 
19
 logger = logging.getLogger(settings.ROOT_LOGGER_NAME).getChild(__name__)
21
 logger = logging.getLogger(settings.ROOT_LOGGER_NAME).getChild(__name__)
26
 def page(request, rel_path):
28
 def page(request, rel_path):
27
     context = Context(request)      # needs to be executed first because of time measurement
29
     context = Context(request)      # needs to be executed first because of time measurement
28
     #
30
     #
29
-    p = creol_page(request, rel_path)
31
+    p = creole_page(request, rel_path)
30
     if access.read_page(request, rel_path):
32
     if access.read_page(request, rel_path):
31
         page_content = p.render_to_html()
33
         page_content = p.render_to_html()
32
     else:
34
     else:
48
     if access.write_page(request, rel_path):
50
     if access.write_page(request, rel_path):
49
         context = Context(request)      # needs to be executed first because of time measurement
51
         context = Context(request)      # needs to be executed first because of time measurement
50
         #
52
         #
51
-        p = creol_page(request, rel_path)
53
+        p = creole_page(request, rel_path)
52
         #
54
         #
53
         if not request.POST:
55
         if not request.POST:
54
             form = EditForm(page_data=p.raw_page_src)
56
             form = EditForm(page_data=p.raw_page_src)
92
 
94
 
93
 def search(request):
95
 def search(request):
94
     context = Context(request)      # needs to be executed first because of time measurement
96
     context = Context(request)      # needs to be executed first because of time measurement
97
+    #
98
+    search_txt = get_search_query(request)
99
+
100
+    sr = whoosh_search(search_txt)
101
+    if sr is None:
102
+        messages.error(request, _('Invalid search pattern: %s') % repr(search_txt))
103
+        sr = []
104
+    page_content = "= Searchresults\n"
105
+    for rel_path in sr:
106
+        p = creole_page(request, rel_path)
107
+        page_content += f"[[/page/{rel_path}|{p.title}]]\n"
108
+    #
95
     context_adaption(
109
     context_adaption(
96
         context,
110
         context,
97
         request,
111
         request,
98
-        page_content="Search is not yet implemented..."
112
+        page_content=mycreole.render_simple(page_content)
99
     )
113
     )
100
     return render(request, 'pages/page.html', context=context)
114
     return render(request, 'pages/page.html', context=context)
101
 
115
 

+ 2
- 1
piki/settings.py View File

135
 
135
 
136
 PAGES_ROOT = os.path.join(BASE_DIR, 'data', 'pages')
136
 PAGES_ROOT = os.path.join(BASE_DIR, 'data', 'pages')
137
 
137
 
138
+WHOOSH_PATH = os.path.join(BASE_DIR, 'data', 'whoosh')
139
+
138
 # Default primary key field type
140
 # Default primary key field type
139
 # https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field
141
 # https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field
140
 
142
 
215
         },
217
         },
216
     },
218
     },
217
 }
219
 }
218
-

+ 1
- 1
requirements.txt View File

2
 Pillow
2
 Pillow
3
 python-creole
3
 python-creole
4
 pytz
4
 pytz
5
-
5
+Whoosh

Loading…
Cancel
Save