Skip to content

Commit

Permalink
very basic solr/xapian backend integration
Browse files Browse the repository at this point in the history
  • Loading branch information
ciur committed Aug 19, 2023
1 parent ce9b583 commit 1e60da1
Show file tree
Hide file tree
Showing 8 changed files with 188 additions and 104 deletions.
15 changes: 13 additions & 2 deletions dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ x-backend: &common # yaml anchor definition
- ./ui:/core_ui/
- ./docker/dev/db:/db
- ./media_root:/core_app/media
- ./index_db:/core_app/index_db
environment:
PAPERMERGE__SECURITY__SECRET_KEY: ${PAPERMERGE__SECURITY__SECRET_KEY}
PAPERMERGE__DATABASE__URL: ${PAPERMERGE__DATABASE__URL}
Expand All @@ -28,7 +27,7 @@ x-backend: &common # yaml anchor definition
PAPERMERGE__AUTH__GITHUB_CLIENT_ID: ${PAPERMERGE__AUTH__GITHUB_CLIENT_ID}
PAPERMERGE__AUTH__GITHUB_AUTHORIZE_URL: ${PAPERMERGE__AUTH__GITHUB_AUTHORIZE_URL}
PAPERMERGE__AUTH__GITHUB_REDIRECT_URI: ${PAPERMERGE__AUTH__GITHUB_REDIRECT_URI}

PAPERMERGE__SEARCH__URL: ${PAPERMERGE__SEARCH__URL}
services:
backend:
<<: *common
Expand All @@ -39,3 +38,15 @@ services:
command: worker
redis:
image: redis:6
solr:
image: solr:9.3
ports:
- "8983:8983"
volumes:
- data:/var/solr
command:
- solr-precreate
- pmg_index

volumes:
data:
15 changes: 7 additions & 8 deletions papermerge/search/management/commands/index.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
from django.conf import settings
from django.core.management.base import BaseCommand
from salinic import Session, create_engine
from salinic.engine import AccessMode
from salinic import IndexRO, create_engine

from papermerge.core.models import BaseTreeNode
from papermerge.search.schema import FOLDER, PAGE, ColoredTag, IndexEntity
from papermerge.search.schema import FOLDER, PAGE, ColoredTag, Model


class Command(BaseCommand):
Expand All @@ -23,8 +22,8 @@ def handle(self, *args, **options):
else:
nodes = BaseTreeNode.objects.all()

engine = create_engine(settings.SEARCH_URL, mode=AccessMode.RW)
session = Session(engine)
engine = create_engine(settings.SEARCH_URL)
index = IndexRO(engine, schema=Model)

for node in nodes:
self.stdout.write(f"Indexing {node}")
Expand All @@ -37,7 +36,7 @@ def handle(self, *args, **options):
last_ver = doc.versions.last()

for page in last_ver.pages.all():
index_entity = IndexEntity(
model = Model(
id=str(page.id),
title=node.title,
user_id=str(node.user_id),
Expand All @@ -60,7 +59,7 @@ def handle(self, *args, **options):
]
)
else:
index_entity = IndexEntity(
model = Model(
id=str(node.id),
title=node.title,
user_id=str(node.user_id),
Expand All @@ -80,4 +79,4 @@ def handle(self, *args, **options):
)

if index_entity:
session.add(index_entity)
index.add(model)
10 changes: 5 additions & 5 deletions papermerge/search/management/commands/search.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from django.conf import settings
from django.core.management.base import BaseCommand
from salinic import Search, Session, create_engine
from salinic import IndexRO, Search, create_engine

from papermerge.search.schema import IndexEntity
from papermerge.search.schema import Model


class Command(BaseCommand):
Expand All @@ -21,9 +21,9 @@ def handle(self, *args, **options):
querystring = querystring.replace('-', '').lower()

engine = create_engine(settings.SEARCH_URL)
session = Session(engine)
index = IndexRO(engine, schema=Model)

sq = Search(IndexEntity).query(querystring)
sq = Search(Model).query(querystring)

for entity in session.exec(sq):
for entity in index.search(sq):
self.stdout.write(f'{entity}')
12 changes: 6 additions & 6 deletions papermerge/search/routers/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@

from django.conf import settings
from fastapi import APIRouter, Depends
from salinic import Search, Session, create_engine
from salinic import IndexRO, Search, create_engine

from papermerge.core.models import User
from papermerge.core.routers.auth import get_current_user as current_user
from papermerge.search.schema import IndexEntity
from papermerge.search.schema import Model

router = APIRouter(
prefix="/search",
Expand All @@ -18,13 +18,13 @@
def search(
q: str,
user: User = Depends(current_user)
) -> List[IndexEntity]:
) -> List[Model]:
engine = create_engine(settings.SEARCH_URL)
session = Session(engine)
index = IndexRO(engine, schema=Model)

sq = Search(IndexEntity).query(q)
sq = Search(Model).query(q)

results: List[IndexEntity] = session.exec(sq)
results: List[Model] = index.search(sq)

# show results only of the documents belonging to the current user
return [
Expand Down
65 changes: 52 additions & 13 deletions papermerge/search/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from pydantic import BaseModel
from salinic import types
from salinic.field import IdField, KeywordField
from salinic.field import KeywordField, TextField, UUIDField
from salinic.schema import Schema
from typing_extensions import Annotated

Expand All @@ -26,26 +26,65 @@ class ColoredTag(BaseModel):
]


class IndexEntity(Schema):
class Model(Schema):
"""Index entity
Documents are indexed by page. Note that we place in same index
both folders and documents, and because the main index entity is page -
we end up having in index two types of entities: folders and pages.
"""
id: Annotated[str, IdField(primary_key=True)] # page id | node_id
id: Annotated[
str,
UUIDField(primary_key=True, general_search=True)
] # page id | node_id

# document ID to whom this page belongs
document_id: Annotated[Optional[str], IdField()] = None
# ID of the document version
document_version_id: Annotated[Optional[str], IdField()] = None
user_id: str
parent_id: str
title: types.Text # document or folder title
document_id: Annotated[
Optional[str],
UUIDField(index=False, general_search=True)
] = None

lang: Annotated[
str,
KeywordField()
] = 'en'

user_id: Annotated[
str,
UUIDField(index=False)
]

parent_id: Annotated[
str,
UUIDField(index=False)
]

title: Annotated[
str,
TextField(general_search=True, multi_lang=True)
] # document or folder title

# text is None in case folder entity
text: types.OptionalText = None
entity_type: types.Keyword # Folder | Page
breadcrumb: Breadcrumb
tags: Tags = []
text: Annotated[
Optional[str],
TextField(general_search=True, multi_lang=True)
] = None

entity_type: Annotated[
str,
KeywordField()
] # folder | page

breadcrumb: Annotated[
List[Tuple[str, str]],
KeywordField(multi_value=True)
]

tags: Annotated[
Optional[list[ColoredTag]],
KeywordField(multi_value=True)
] = []

# None in case of folder entity
page_number: types.OptionalNumeric = None
# None in case of folder entity
Expand Down
123 changes: 61 additions & 62 deletions papermerge/search/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,72 @@

from celery import shared_task
from django.conf import settings
from salinic import Session, create_engine
from salinic.engine import AccessMode
from salinic import IndexRW, create_engine

from papermerge.core.constants import INDEX_ADD_NODE, INDEX_REMOVE_NODE
from papermerge.core.models import BaseTreeNode
from papermerge.search.schema import FOLDER, PAGE, ColoredTag, IndexEntity
from papermerge.search.schema import FOLDER, PAGE, ColoredTag, Model

logger = logging.getLogger(__name__)


def from_folder(node: BaseTreeNode) -> IndexEntity:
index_entity = IndexEntity(
@shared_task(name=INDEX_ADD_NODE)
def index_add_node(node_id: str):
    """Add a node to the search index.

    "Add" means insert-or-update: if the node was already indexed, its
    record in the index is updated; otherwise a new record is inserted.
    A folder is converted with ``from_folder`` and a document with
    ``from_document``; every resulting model is added to the index.

    The task is best-effort: when the search engine cannot be opened, or
    the node no longer exists in the database, a warning is logged and the
    task returns without raising.

    Args:
        node_id: primary key of the ``BaseTreeNode`` to index.
    """
    logger.warning(f'INDEX ADD NODE {node_id}')
    try:
        engine = create_engine(settings.SEARCH_URL)
    except Exception as e:
        # may happen when using xapian search backend and multiple
        # workers try to get write access to the index
        logger.warning(f"Exception '{e}' occured while opening engine")
        logger.warning(f"Index add for {node_id} interruped")
        return

    # NOTE(review): if the backend acquires its write lock when IndexRW is
    # constructed (rather than in create_engine), this call needs the same
    # guard as above -- confirm against the salinic implementation.
    index = IndexRW(engine, schema=Model)

    try:
        node = BaseTreeNode.objects.get(pk=node_id)
    except BaseTreeNode.DoesNotExist:
        # The node may have been deleted between the moment the task was
        # queued and the moment a worker picked it up; nothing to index.
        logger.warning(f"Node {node_id} not found; index add skipped")
        return

    if node.is_document:
        models = from_document(node)
    else:
        models = [from_folder(node)]

    for model in models:
        index.add(model)


@shared_task(name=INDEX_REMOVE_NODE)
def index_remove_node(node_ids: List[str]):
    """Remove the given nodes from the search index.

    For every node ID two removals are issued: one for the term
    ``ID<node_id>`` and one for ``DOCUMENT_ID<node_id without dashes>``.
    If the search engine cannot be opened, a warning is logged and nothing
    is removed.

    Args:
        node_ids: IDs of the nodes to drop from the index.
    """
    logger.warning(f'INDEX REMOVE NODE {node_ids}')
    try:
        engine = create_engine(settings.SEARCH_URL)
    except Exception as exc:
        # Opening the engine may fail when the xapian backend is in use and
        # several workers compete for write access to the index.
        logger.warning(f"Exception '{exc}' occured while opening engine")
        logger.warning(f"Index remove for {node_ids} interruped")
    else:
        writable_index = IndexRW(engine, schema=Model)

        for nid in node_ids:
            # Entry stored under the node's own ID.
            writable_index.remove(f"ID{nid}")
            # Entries keyed by document ID (dashes stripped); presumably
            # the pages of that document -- confirm against the backend.
            writable_index.remove(f"DOCUMENT_ID{nid.replace('-', '')}")


def from_folder(node: BaseTreeNode) -> Model:
index_entity = Model(
id=str(node.id),
title=node.title,
user_id=str(node.user_id),
Expand All @@ -35,13 +89,13 @@ def from_folder(node: BaseTreeNode) -> IndexEntity:
return index_entity


def from_document(node: BaseTreeNode) -> List[IndexEntity]:
def from_document(node: BaseTreeNode) -> List[Model]:
result = []
doc = node.document
last_ver = doc.versions.last()

for page in last_ver.pages.all():
index_entity = IndexEntity(
index_entity = Model(
id=str(page.id),
title=node.title,
user_id=str(node.user_id),
Expand All @@ -66,58 +120,3 @@ def from_document(node: BaseTreeNode) -> List[IndexEntity]:
result.append(index_entity)

return result


@shared_task(name=INDEX_ADD_NODE)
def index_add_node(node_id: str):
"""Add node to the search index
Add operation means either insert or update depending
on if folder entity is already present in the index.
In other words, if folder was already indexed (added before), its record
in index will be updated otherwise its record will be inserted.
"""
logger.warning(f'INDEX ADD NODE {node_id}')
try:
# may happen when using xapian search backend and multiple
# workers try to get write access to the index
engine = create_engine(settings.SEARCH_URL, mode=AccessMode.RW)
except Exception as e:
logger.warning(f"Exception '{e}' occured while opening engine")
logger.warning(f"Index add for {node_id} interruped")
return

session = Session(engine)

node = BaseTreeNode.objects.get(pk=node_id)
if node.is_document:
index_entities = from_document(node)
else:
index_entities = [from_folder(node)]

for entity in index_entities:
session.add(entity)


@shared_task(name=INDEX_REMOVE_NODE)
def index_remove_node(node_ids: List[str]):
"""Removes node from the search index
"""
logger.warning(f'INDEX REMOVE NODE {node_ids}')
try:
engine = create_engine(settings.SEARCH_URL, mode=AccessMode.RW)
except Exception as e:
# may happen when using xapian search backend and multiple
# workers try to get write access to the index
logger.warning(f"Exception '{e}' occured while opening engine")
logger.warning(f"Index remove for {node_ids} interruped")
return

session = Session(engine)

for node_id in node_ids:
id_term = f"ID{node_id}"
session.remove(id_term)

document_id_term = f"DOCUMENT_ID{node_id.replace('-', '')}"
session.remove(document_id_term)
Loading

0 comments on commit 1e60da1

Please sign in to comment.