New TGRM search implementation

This commit is contained in:
2021-12-05 23:54:50 +03:00
parent 5fbe26c0f2
commit 7e825d099b
8 changed files with 91 additions and 74 deletions

View File

@@ -1,4 +1,5 @@
from typing import Optional from typing import Optional
from datetime import date
from pydantic import BaseModel from pydantic import BaseModel
@@ -44,6 +45,9 @@ class AuthorBook(BaseModel):
title: str title: str
lang: str lang: str
file_type: str file_type: str
available_types: list[str]
uploaded: date
annotation_exists: bool
class Config(ORJSONConfig): class Config(ORJSONConfig):
pass pass

View File

@@ -3,11 +3,40 @@ from app.models import Author
from app.services.common import TRGMSearchService from app.services.common import TRGMSearchService
# Raw SQL that yields, as a single ARRAY value, the ids of authors matching
# the bound `:query` search text.
#
# * An author is a candidate when `:query` matches (`%`) at least one of three
#   name spellings: "last first middle", "last first", or "last" alone.
#   NOTE(review): `%` / similarity() are presumably the pg_trgm operator and
#   function - confirm the extension is enabled on the target database.
# * Authors are kept only if EXISTS finds at least one linked book with
#   is_deleted = 'f'.
# * `sml` is the GREATEST of the three similarity() scores; `books_count`
#   counts the author's linked books with is_deleted = 'f'.
# * Ids are ordered by `sml` DESC, then by `books_count` DESC.
GET_OBJECTS_IDS_QUERY = """
SELECT ARRAY(
WITH filtered_authors AS (
SELECT
id,
GREATEST(
similarity((last_name || ' ' || first_name || ' ' || middle_name), :query),
similarity((last_name || ' ' || first_name), :query),
similarity((last_name), :query)
) as sml,
(
SELECT count(*) FROM book_authors
LEFT JOIN books ON books.id = book
WHERE author = authors.id AND books.is_deleted = 'f'
) as books_count
FROM authors
WHERE (
(last_name || ' ' || first_name || ' ' || middle_name) % :query OR
(last_name || ' ' || first_name) % :query OR
(last_name) % :query
) AND
EXISTS (
SELECT * FROM book_authors
LEFT JOIN books ON books.id = book
WHERE author = authors.id AND books.is_deleted = 'f'
)
)
SELECT fauthors.id FROM filtered_authors as fauthors
ORDER BY fauthors.sml DESC, fauthors.books_count DESC
);
"""
class AuthorTGRMSearchService(TRGMSearchService):
    """Author search service configured with this module's raw SQL id query.

    The class only supplies configuration to ``TRGMSearchService``: the model
    to load, the relations to prefetch, and the SQL statement that produces
    the ordered list of matching author ids.
    """

    # SQL statement the base class executes to obtain matching author ids.
    GET_OBJECT_IDS_QUERY = GET_OBJECTS_IDS_QUERY
    # Relations prefetched for the found authors.
    PREFETCH_RELATED = ["source", "annotations"]
    MODEL_CLASS = Author

View File

@@ -8,15 +8,22 @@ from app.services.common import TRGMSearchService
from app.serializers.book import CreateBook, CreateRemoteBook from app.serializers.book import CreateBook, CreateRemoteBook
# Raw SQL that yields, as a single ARRAY value, the ids of books whose title
# matches the bound `:query` search text.
#
# Soft-deleted books (is_deleted = 't') are excluded: the previous
# implementation applied an `is_deleted == False` filter, and the author and
# sequence search queries likewise only consider non-deleted books.
# Ids are ordered by title similarity DESC, with the id as a stable tie-breaker.
GET_OBJECTS_IDS_QUERY = """
SELECT ARRAY(
WITH filtered_books AS (
SELECT id, similarity(title, :query) as sml FROM books
WHERE books.title % :query AND books.is_deleted = 'f'
)
SELECT fbooks.id FROM filtered_books as fbooks
ORDER BY fbooks.sml DESC, fbooks.id
);
"""
class BookTGRMSearchService(TRGMSearchService):
    """Book search service configured with this module's raw SQL id query.

    The class only supplies configuration to ``TRGMSearchService``: the model
    to load, the relations to prefetch, and the SQL statement that produces
    the ordered list of matching book ids.
    """

    # SQL statement the base class executes to obtain matching book ids.
    GET_OBJECT_IDS_QUERY = GET_OBJECTS_IDS_QUERY
    # Relations prefetched for the found books.
    PREFETCH_RELATED = ["source", "authors", "annotations"]
    MODEL_CLASS = BookDB
class BookCreator: class BookCreator:

View File

@@ -1,7 +1,5 @@
from typing import Optional, Generic, TypeVar, Union from typing import Optional, Generic, TypeVar, Union
from itertools import permutations
from databases import Database from databases import Database
import json
from fastapi_pagination.api import resolve_params from fastapi_pagination.api import resolve_params
from fastapi_pagination.bases import AbstractParams, RawParams from fastapi_pagination.bases import AbstractParams, RawParams
@@ -10,17 +8,7 @@ import aioredis
import orjson import orjson
from ormar import Model, QuerySet from ormar import Model, QuerySet
from sqlalchemy import text, func, select, or_, Table, Column, cast, Text from sqlalchemy import Table
from sqlalchemy.orm import Session
def join_fields(fields):
result = fields[0]
for el in fields[1:]:
result += text("' '") + el
return result
T = TypeVar('T', bound=Model) T = TypeVar('T', bound=Model)
@@ -28,10 +16,9 @@ T = TypeVar('T', bound=Model)
class TRGMSearchService(Generic[T]): class TRGMSearchService(Generic[T]):
MODEL_CLASS: Optional[T] = None MODEL_CLASS: Optional[T] = None
FIELDS: Optional[list[Column]] = None
SELECT_RELATED: Optional[Union[list[str], str]] = None SELECT_RELATED: Optional[Union[list[str], str]] = None
PREFETCH_RELATED: Optional[Union[list[str], str]] = None PREFETCH_RELATED: Optional[Union[list[str], str]] = None
FILTERS = [] GET_OBJECT_IDS_QUERY: Optional[str] = None
CACHE_TTL = 5 * 60 CACHE_TTL = 5 * 60
@classmethod @classmethod
@@ -60,52 +47,18 @@ class TRGMSearchService(Generic[T]):
@classmethod
@property
def object_ids_query(cls) -> str:
    """Return the SQL statement configured in ``GET_OBJECT_IDS_QUERY``.

    Fails with an ``AssertionError`` when a subclass has left
    ``GET_OBJECT_IDS_QUERY`` at ``None``.
    """
    configured_query = cls.GET_OBJECT_IDS_QUERY
    assert configured_query is not None, f"GET_OBJECT_IDS_QUERY in {cls.__name__} don't set!"
    return configured_query
@classmethod
def get_similarity_subquery(cls, query: str):
combs = cls.fields_combinations
return func.greatest(
*[func.similarity(join_fields(comb), cast(query, Text)) for comb in combs]
).label("sml")
@classmethod
def get_similarity_filter_subquery(cls, query: str):
return or_(
*[join_fields(comb) % f"{query}::text" for comb in cls.fields_combinations]
)
@classmethod
async def _get_object_ids(cls, query_data: str) -> list[int]:
    """Run the configured id query and return the ids it produced.

    ``query_data`` is bound to the ``:query`` placeholder of the statement
    returned by ``object_ids_query``. The statement is expected to return a
    single row whose ``array`` column holds the ids.

    Raises:
        ValueError: if the database returned no row at all.
    """
    bind_values = {"query": query_data}
    row = await cls.database.fetch_one(cls.object_ids_query, bind_values)

    if row is not None:
        return row['array']

    raise ValueError('Something is wrong!')
@classmethod @classmethod
def get_cache_key(cls, query_data: str) -> str: def get_cache_key(cls, query_data: str) -> str:

View File

@@ -3,9 +3,32 @@ from app.models import Sequence
from app.services.common import TRGMSearchService from app.services.common import TRGMSearchService
# Raw SQL that yields, as a single ARRAY value, the ids of sequences whose
# name matches the bound `:query` search text.
#
# * Only sequences with at least one linked book with is_deleted = 'f' are kept.
# * `sml` is the similarity of the name to `:query`; `books_count` counts the
#   sequence's linked books with is_deleted = 'f'.
# * Ids are ordered by `sml` DESC, then by `books_count` DESC.
#
# The statement must stay a plain SELECT: the caller reads the `array` column
# of the returned row, which an EXPLAIN-prefixed statement would not provide.
GET_OBJECTS_IDS_QUERY = """
SELECT ARRAY(
WITH filtered_sequences AS (
SELECT
id,
similarity(name, :query) as sml,
(
SELECT count(*) FROM book_sequences
LEFT JOIN books ON books.id = book
WHERE sequence = sequences.id AND books.is_deleted = 'f'
) as books_count
FROM sequences
WHERE name % :query AND
EXISTS (
SELECT * FROM book_sequences
LEFT JOIN books ON books.id = book
WHERE sequence = sequences.id AND books.is_deleted = 'f'
)
)
SELECT fsequences.id FROM filtered_sequences as fsequences
ORDER BY fsequences.sml DESC, fsequences.books_count DESC
);
"""
class SequenceTGRMSearchService(TRGMSearchService):
    """Sequence search service configured with this module's raw SQL id query.

    The class only supplies configuration to ``TRGMSearchService``: the model
    to load, the relations to prefetch, and the SQL statement that produces
    the ordered list of matching sequence ids.
    """

    MODEL_CLASS = Sequence
    # Relations prefetched for the found sequences.
    PREFETCH_RELATED = ["source"]
    # The base class reads ``GET_OBJECT_IDS_QUERY`` (singular "OBJECT"); the
    # module-level constant is spelled ``GET_OBJECTS_IDS_QUERY``. Assigning to
    # the base-class attribute name is what makes the query take effect.
    GET_OBJECT_IDS_QUERY = GET_OBJECTS_IDS_QUERY

View File

@@ -1,6 +1,6 @@
from typing import Protocol, TypeVar, Any, Generic, Sequence, runtime_checkable from typing import Protocol, TypeVar, Any, Generic, Sequence, runtime_checkable
from pydantic import PositiveInt from pydantic import conint
from fastapi_pagination import Page, Params from fastapi_pagination import Page, Params
from fastapi_pagination.bases import AbstractParams from fastapi_pagination.bases import AbstractParams
@@ -16,7 +16,7 @@ T = TypeVar('T', ToDict, Any)
class CustomPage(Page[T], Generic[T]): class CustomPage(Page[T], Generic[T]):
total_pages: PositiveInt total_pages: conint(ge=0) # type: ignore
@classmethod @classmethod
def create( def create(

View File

@@ -69,7 +69,7 @@ async def get_author_annotation(id: int):
@author_router.get("/{id}/books", response_model=CustomPage[AuthorBook], dependencies=[Depends(Params)])
async def get_author_books(id: int):
    """Return a paginated, title-ordered page of books for the given author id.

    The ``source`` and ``annotations`` relations are loaded together with the
    books via ``select_related``.
    """
    author_books = (
        BookDB.objects
        .select_related(["source", "annotations"])
        .filter(authors__id=id)
        .order_by('title')
    )
    return await paginate(author_books)

View File

@@ -6,8 +6,9 @@ from fastapi_pagination import Params
from fastapi_pagination.ext.ormar import paginate from fastapi_pagination.ext.ormar import paginate
from app.utils.pagination import CustomPage from app.utils.pagination import CustomPage
from app.models import Book as BookDB, Author as AuthorDB, AuthorAnnotation as AuthorAnnotationDB from app.models import Book as BookDB, Author as AuthorDB, BookAnnotation as BookAnnotationDB
from app.serializers.book import Book, RemoteBook, BookDetail, CreateBook, UpdateBook, CreateRemoteBook from app.serializers.book import Book, RemoteBook, BookDetail, CreateBook, UpdateBook, CreateRemoteBook
from app.serializers.book_annotation import BookAnnotation
from app.services.book import BookTGRMSearchService, BookCreator from app.services.book import BookTGRMSearchService, BookCreator
from app.filters.book import get_book_filter from app.filters.book import get_book_filter
from app.depends import check_token from app.depends import check_token
@@ -82,9 +83,9 @@ async def update_book(id: int, data: UpdateBook):
return book return book
@book_router.get("/{id}/annotation") @book_router.get("/{id}/annotation", response_model=BookAnnotation)
async def get_book_annotation(id: int): async def get_book_annotation(id: int):
annotation = await AuthorAnnotationDB.objects.get(book__id=id) annotation = await BookAnnotationDB.objects.get(book__id=id)
if annotation is None: if annotation is None:
raise HTTPException(status.HTTP_404_NOT_FOUND) raise HTTPException(status.HTTP_404_NOT_FOUND)