Procházet zdrojové kódy

First minimal whoosh search implemented

master
Dirk Alders před 3 měsíci
rodič
revize
30b817e359

+ 1
- 1
.gitignore Zobrazit soubor

@@ -2,6 +2,7 @@
2 2
 data/media
3 3
 data/pages
4 4
 data/static
5
+data/whoosh
5 6
 db.sqlite3
6 7
 config.py
7 8
 
@@ -156,4 +157,3 @@ pip-selfcheck.json
156 157
 
157 158
 # .nfs files are created when an open file is removed but is still being accessed
158 159
 .nfs*
159
-

+ 4
- 5
README.md Zobrazit soubor

@@ -65,9 +65,8 @@ Now there are two ways to finalise your configuration. The first way is for a te
65 65
 ## Backup
66 66
 ### Create Backup files
67 67
     source venv/bin/activate
68
-    python manage.py dumpdata --natural-foreign --natural-primary -e contenttypes -e sessions -e auth.Permission -e sessions -e patt --indent 2 > dump_base.json
69
-    python manage.py dumpdata --natural-foreign --natural-primary -e contenttypes -e sessions -e auth.Permission -e sessions piki --indent 2 > dump_piki.json
70
-    tar -cvzf dump_data.tgz data/media data/pages
68
+    python manage.py dumpdata --natural-foreign --natural-primary -e contenttypes -e sessions -e auth.Permission -e sessions -e pages --indent 2 > dump_pages.json
69
+    tar -cvzf dump_data.tgz data/media data/pages
71 70
 
72 71
 ### Restore Backup
73 72
 
@@ -79,6 +78,6 @@ If you are starting without a database, you need to create one
79 78
 
80 79
 Afterward add data step by step to the database.
81 80
 
82
-    python manage.py loaddata dump_base.json
83
-    python manage.py loaddata dump_patt.json
81
+    python manage.py loaddata dump_pages.json
82
+    rm -rf data/pages data/media
84 83
     tar -xvzf dump_data.tgz

+ 4
- 0
pages/__init__.py Zobrazit soubor

@@ -11,3 +11,7 @@ def url_helpview(request, page):
11 11
 
12 12
 def url_edit(request, rel_path):
13 13
     return reverse('page-edit', kwargs={'rel_path': rel_path})
14
+
15
+
16
+def get_search_query(request):
17
+    return request.GET.get('q')

+ 0
- 0
pages/management/__init__.py Zobrazit soubor


+ 0
- 0
pages/management/commands/__init__.py Zobrazit soubor


+ 9
- 0
pages/management/commands/rebuild_index.py Zobrazit soubor

@@ -0,0 +1,9 @@
1
+from django.core.management.base import BaseCommand
2
+from pages.search import create_index, rebuild_index
3
+
4
+
5
+class Command(BaseCommand):
6
+    def handle(self, *args, **options):
7
+        ix = create_index()
8
+        n = rebuild_index(ix)
9
+        self.stdout.write(self.style.SUCCESS('Search index for %d items created.') % n)

+ 57
- 39
pages/page.py Zobrazit soubor

@@ -1,69 +1,87 @@
1 1
 from django.conf import settings
2 2
 
3 3
 import fstools
4
+import logging
4 5
 from pages import messages, url_page
5 6
 import mycreole
6 7
 import os
7 8
 
9
+logger = logging.getLogger(settings.ROOT_LOGGER_NAME).getChild(__name__)
8 10
 
9
-class creol_page(object):
10
-    SPLITCHAR = ":"
11
-    FOLDER_ATTACHMENTS = "attachments"
11
+
12
+class base_page(object):
12 13
     FOLDER_CONTENT = 'content'
13 14
     FILE_NAME = 'page'
15
+    SPLITCHAR = ":"
14 16
 
15
-    def __init__(self, request, rel_path) -> None:
16
-        self._rel_path = rel_path
17
-        self._request = request
17
+    def __init__(self, path):
18
+        if path.startswith(settings.PAGES_ROOT):
19
+            self._path = path
20
+        else:
21
+            self._path = os.path.join(settings.PAGES_ROOT, path.replace("/", 2*self.SPLITCHAR))
22
+        self._raw_page_src = None
18 23
 
19
-    def rel_path_is_valid(self):
20
-        return not self.SPLITCHAR in self._rel_path
24
+    def _load_page_src(self):
25
+        if self._raw_page_src is None:
26
+            try:
27
+                with open(self.filename, 'r') as fh:
28
+                    self._raw_page_src = fh.read()
29
+            except FileNotFoundError:
30
+                self._raw_page_src = ""
21 31
 
22
-    def is_available(self):
23
-        return os.path.isfile(self.content_file_name)
32
+    def update_page(self, page_txt):
33
+        from .search import update_item
34
+        #
35
+        folder = os.path.dirname(self.filename)
36
+        if not os.path.exists(folder):
37
+            fstools.mkdir(folder)
38
+        with open(self.filename, 'w') as fh:
39
+            fh.write(page_txt)
40
+        update_item(self)
24 41
 
25 42
     @property
26
-    def title(self):
27
-        return os.path.basename(self._rel_path)
43
+    def filename(self):
44
+        return os.path.join(self._path, self.FOLDER_CONTENT, self.FILE_NAME)
28 45
 
29 46
     @property
30
-    def attachment_path(self):
31
-        return os.path.join(self.content_folder_name, self.FOLDER_ATTACHMENTS)
32
-
33
-    def __content_folder_filter__(self, folder):
34
-        return folder.replace('/', '::')
47
+    def rel_path(self):
48
+        return os.path.basename(self._path).replace(2*self.SPLITCHAR, "/")
35 49
 
36
-    def __folder_content_filter__(self, folder):
37
-        return folder.replace('::', '/')
50
+    def rel_path_is_valid(self):
51
+        return not self.SPLITCHAR in self.rel_path
38 52
 
39
-    @property
40
-    def content_folder_name(self):
41
-        return self.__content_folder_filter__(self._rel_path)
53
+    def is_available(self):
54
+        is_a = os.path.isfile(self.filename)
55
+        if not is_a:
56
+            logger.info("page.is_available: Not available - %s", self.filename)
57
+        return is_a
42 58
 
43 59
     @property
44
-    def content_file_name(self):
45
-        return os.path.join(settings.PAGES_ROOT, self.content_folder_name, self.FOLDER_CONTENT, self.FILE_NAME)
60
+    def title(self):
61
+        return os.path.basename(self._path).split("::")[-1]
46 62
 
47 63
     @property
48 64
     def raw_page_src(self):
49
-        try:
50
-            with open(self.content_file_name, 'r') as fh:
51
-                return fh.read()
52
-        except FileNotFoundError:
53
-            return ""
65
+        self._load_page_src()
66
+        return self._raw_page_src
54 67
 
55
-    def update_page(self, page_txt):
56
-        folder = os.path.dirname(self.content_file_name)
57
-        if not os.path.exists(folder):
58
-            fstools.mkdir(folder)
59
-        with open(self.content_file_name, 'w') as fh:
60
-            fh.write(page_txt)
68
+
69
+class creole_page(base_page):
70
+    FOLDER_ATTACHMENTS = "attachments"
71
+
72
+    def __init__(self, request, path) -> None:
73
+        self._request = request
74
+        super().__init__(path)
75
+
76
+    @property
77
+    def attachment_path(self):
78
+        return os.path.join(os.path.basename(self._path), self.FOLDER_ATTACHMENTS)
61 79
 
62 80
     def render_to_html(self):
63 81
         if self.is_available():
64 82
             return self.render_text(self._request, self.raw_page_src)
65 83
         else:
66
-            messages.unavailable_msg_page(self._request, self._rel_path)
84
+            messages.unavailable_msg_page(self._request, self.rel_path)
67 85
             return ""
68 86
 
69 87
     def render_text(self, request, txt):
@@ -102,18 +120,18 @@ class creol_page(object):
102 120
         #
103 121
         rv = ""
104 122
         # create a rel_path list
105
-        pathlist = [self.__folder_content_filter__(os.path.basename(path)) for path in fstools.dirlist(settings.PAGES_ROOT, rekursive=False)]
123
+        pathlist = [base_page(path).rel_path for path in fstools.dirlist(settings.PAGES_ROOT, rekursive=False)]
106 124
         # sort basename
107 125
         pathlist.sort(key=os.path.basename)
108 126
 
109 127
         last_char = None
110 128
         for contentname in pathlist:
111 129
             #
112
-            if (contentname.startswith(self._rel_path) or allpages) and contentname != self._rel_path:
130
+            if (contentname.startswith(self.rel_path) or allpages) and contentname != self.rel_path:
113 131
                 if allpages:
114 132
                     name = contentname
115 133
                 else:
116
-                    name = contentname[len(self._rel_path)+1:]
134
+                    name = contentname[len(self.rel_path)+1:]
117 135
                 if name.count('/') < depth and name.startswith(startname):
118 136
                     if last_char != os.path.basename(name)[0].upper():
119 137
                         last_char = os.path.basename(name)[0].upper()

+ 96
- 0
pages/search.py Zobrazit soubor

@@ -0,0 +1,96 @@
1
+from django.conf import settings
2
+
3
+import fstools
4
+import logging
5
+import os
6
+from whoosh.fields import Schema, ID, TEXT
7
+from whoosh.qparser.dateparse import DateParserPlugin
8
+from whoosh import index, qparser
9
+
10
+from pages.page import base_page
11
+
12
+logger = logging.getLogger(settings.ROOT_LOGGER_NAME).getChild(__name__)
13
+
14
+
15
+SCHEMA = Schema(
16
+    id=ID(unique=True, stored=True),
17
+    # Page
18
+    title=TEXT,
19
+    page_src=TEXT
20
+)
21
+
22
+
23
+def mk_whooshpath_if_needed():
24
+    if not os.path.exists(settings.WHOOSH_PATH):
25
+        fstools.mkdir(settings.WHOOSH_PATH)
26
+
27
+
28
+def create_index():
29
+    mk_whooshpath_if_needed()
30
+    logger.debug('Search Index created.')
31
+    return index.create_in(settings.WHOOSH_PATH, schema=SCHEMA)
32
+
33
+
34
+def rebuild_index(ix):
35
+    page_path = fstools.dirlist(settings.PAGES_ROOT, rekursive=False)
36
+    for path in page_path:
37
+        bp = base_page(path)
38
+        add_item(ix, bp)
39
+    return len(page_path)
40
+
41
+
42
+def load_index():
43
+    mk_whooshpath_if_needed()
44
+    try:
45
+        ix = index.open_dir(settings.WHOOSH_PATH)
46
+    except index.EmptyIndexError:
47
+        ix = create_index()
48
+    else:
49
+        logger.debug('Search Index opened.')
50
+    return ix
51
+
52
+
53
+def add_item(ix, bp: base_page):
54
+    # Define Standard data
55
+    #
56
+    data = dict(
57
+        id=bp.rel_path,
58
+        title=bp.title,
59
+        page_src=bp.raw_page_src
60
+    )
61
+    with ix.writer() as w:
62
+        logger.info('Adding document with id=%s to the search index.', data.get('id'))
63
+        w.add_document(**data)
64
+        for key in data:
65
+            logger.debug('  - Adding %s=%s', key, repr(data[key]))
66
+
67
+
68
+def whoosh_search(search_txt):
69
+    ix = load_index()
70
+    qp = qparser.MultifieldParser(['title', 'page_src'], ix.schema)
71
+    qp.add_plugin(DateParserPlugin(free=True))
72
+    try:
73
+        q = qp.parse(search_txt)
74
+    except AttributeError:
75
+        return None
76
+    except Exception:
77
+        return None
78
+    with ix.searcher() as s:
79
+        results = s.search(q, limit=None)
80
+        rpl = []
81
+        for hit in results:
82
+            rpl.append(hit['id'])
83
+        return rpl
84
+
85
+
86
+def delete_item(ix, bp: base_page):
87
+    with ix.writer() as w:
88
+        logger.info('Removing document with id=%s from the search index.', bp.rel_path)
89
+        w.delete_by_term("task_id", bp.rel_path)
90
+
91
+
92
+def update_item(bp: base_page):
93
+    ix = load_index()
94
+    delete_item(ix, bp)
95
+    add_item(ix, bp)
96
+

+ 18
- 4
pages/views.py Zobrazit soubor

@@ -8,12 +8,14 @@ import logging
8 8
 from . import access
9 9
 from . import messages
10 10
 from . import url_page
11
+from . import get_search_query
11 12
 import config
12 13
 from .context import context_adaption
13 14
 from .forms import EditForm
14 15
 from .help import help_pages
15 16
 import mycreole
16
-from .page import creol_page
17
+from .page import creole_page
18
+from .search import whoosh_search
17 19
 from themes import Context
18 20
 
19 21
 logger = logging.getLogger(settings.ROOT_LOGGER_NAME).getChild(__name__)
@@ -26,7 +28,7 @@ def root(request):
26 28
 def page(request, rel_path):
27 29
     context = Context(request)      # needs to be executed first because of time mesurement
28 30
     #
29
-    p = creol_page(request, rel_path)
31
+    p = creole_page(request, rel_path)
30 32
     if access.read_page(request, rel_path):
31 33
         page_content = p.render_to_html()
32 34
     else:
@@ -48,7 +50,7 @@ def edit(request, rel_path):
48 50
     if access.write_page(request, rel_path):
49 51
         context = Context(request)      # needs to be executed first because of time mesurement
50 52
         #
51
-        p = creol_page(request, rel_path)
53
+        p = creole_page(request, rel_path)
52 54
         #
53 55
         if not request.POST:
54 56
             form = EditForm(page_data=p.raw_page_src)
@@ -92,10 +94,22 @@ def edit(request, rel_path):
92 94
 
93 95
 def search(request):
94 96
     context = Context(request)      # needs to be executed first because of time mesurement
97
+    #
98
+    search_txt = get_search_query(request)
99
+
100
+    sr = whoosh_search(search_txt)
101
+    if sr is None:
102
+        messages.error(request, _('Invalid search pattern: %s') % repr(search_txt))
103
+        sr = []
104
+    page_content = "= Searchresults\n"
105
+    for rel_path in sr:
106
+        p = creole_page(request, rel_path)
107
+        page_content += f"[[/page/{rel_path}|{p.title}]]\n"
108
+    #
95 109
     context_adaption(
96 110
         context,
97 111
         request,
98
-        page_content="Search is not yet implemented..."
112
+        page_content=mycreole.render_simple(page_content)
99 113
     )
100 114
     return render(request, 'pages/page.html', context=context)
101 115
 

+ 2
- 1
piki/settings.py Zobrazit soubor

@@ -135,6 +135,8 @@ MYCREOLE_BAR = {
135 135
 
136 136
 PAGES_ROOT = os.path.join(BASE_DIR, 'data', 'pages')
137 137
 
138
+WHOOSH_PATH = os.path.join(BASE_DIR, 'data', 'whoosh')
139
+
138 140
 # Default primary key field type
139 141
 # https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field
140 142
 
@@ -215,4 +217,3 @@ File "%(pathname)s", line %(lineno)d, in %(funcName)s
215 217
         },
216 218
     },
217 219
 }
218
-

+ 1
- 1
requirements.txt Zobrazit soubor

@@ -2,4 +2,4 @@ Django
2 2
 Pillow
3 3
 python-creole
4 4
 pytz
5
-
5
+Whoosh

Načítá se…
Zrušit
Uložit