Skip to content
Snippets Groups Projects
Commit ccf11b6f authored by Frank Lange's avatar Frank Lange
Browse files

add endpoint /curation/suggest/languages

parent 1bc7327e
No related branches found
No related tags found
No related merge requests found
Pipeline #1628368 passed
from typing import List
from rdflib import BNode, Graph, Literal, Variable
from rdflib.collection import Collection
from project.dalia.api_models.api_models import (
CurationSuggestPaginatedResult,
CurationSuggestSearchRequest,
LabelValueItem,
)
from project.dalia.query.utils import query_ontologies_dataset
from project.dalia.query_builder.query_builder import (
Aggregates,
BIND,
FILTER,
FunctionExpressions,
OPTIONAL,
Operators,
QueryBuilder,
)
from project.dalia.rdf.namespace import Jena_text, SKOS_last_call, lvont
# data for endpoint /curation/suggest/languages
def get_languages_suggestions(request: CurationSuggestSearchRequest) -> CurationSuggestPaginatedResult:
    """Build the paginated language-suggestion result for a curation search request.

    Appends a wildcard to the search term so the text index matches prefixes,
    then combines the total hit count with one page of labelled results.
    """
    wildcard_query = request.q + "*"
    return CurationSuggestPaginatedResult(
        count=count_results_from_languages_search(wildcard_query),
        offset=request.offset,
        limit=request.limit,
        results=_search_languages_and_retrieve_labels(wildcard_query, request.limit, request.offset)
    )
# SPARQL variables shared between the search and the count query:
# ?lang is the language IRI, ?label its English name.
_VARIABLES = {name: Variable(name) for name in ("lang", "label")}
def _where_for_text_search(query: str, var_lang: Variable, var_score: Variable, var_label: Variable):
    """Return the WHERE clauses for a Jena full-text search over skos:prefLabel.

    Produces the pattern
        ( ?lang ?score ?label ) text:query ( skos:prefLabel "<query>" )
    and restricts matches to English labels.
    """
    subjects = Collection(Graph(), BNode(), [var_lang, var_score, var_label])
    objects = Collection(Graph(), BNode(), [SKOS_last_call.prefLabel, Literal(query)])
    text_search_triple = (subjects, Jena_text.query, objects)
    english_labels_only = FILTER(
        Operators.EQ(FunctionExpressions.LANG(var_label), Literal("en"))
    )
    return (text_search_triple, english_labels_only)
def prepare_query_for_language_search_and_label_retrieval(query: str, limit: int, offset: int) -> str:
    """Build the SPARQL query that searches languages and retrieves their labels.

    Results are ordered by descending text-search score; ties are broken so
    that languages carrying an ISO 639-1 code come first.
    """
    lang = _VARIABLES["lang"]
    label = _VARIABLES["label"]
    score = Variable("score")
    iso_code = Variable("iso639P1Code")
    bound_iso_code = Variable("boundIso639P1Code")
    builder = QueryBuilder().SELECT(
        *_VARIABLES.values()
    ).WHERE(
        *_where_for_text_search(query, lang, score, label),
        OPTIONAL(
            (lang, lvont.iso639P1Code, iso_code),
        ),
        # In case of identical text search scores we would like to show languages with ISO639-1 code first, but
        # ?iso639P1Code might be unbound. This causes problems, because in the ORDER BY clause unbound values come first
        # (see https://www.w3.org/TR/sparql11-query/#modOrderBy).
        # The workaround is to bind ?boundIso639P1Code to the boolean "false" in case ?iso639P1Code is unbound. In
        # Apache Jena's SPARQL engine strings have a lower order than booleans in the ORDER BY clause.
        BIND(FunctionExpressions.COALESCE(iso_code, Literal(False)), bound_iso_code),
    ).ORDER_BY(
        FunctionExpressions.DESC(score),
        bound_iso_code,
    )
    return builder.LIMIT(limit).OFFSET(offset).build()
def prepare_query_for_count_in_language_search(query: str) -> str:
    """Build the SPARQL query counting all languages matched by the text search."""
    lang = _VARIABLES["lang"]
    label = _VARIABLES["label"]
    where_clauses = _where_for_text_search(query, lang, Variable("score"), label)
    return QueryBuilder().SELECT(
        count=Aggregates.COUNT(lang),
    ).WHERE(*where_clauses).build()
def _search_languages_and_retrieve_labels(query: str, limit: int, offset: int) -> List[LabelValueItem]:
    """Run the language text search and map each result row to a LabelValueItem."""
    sparql = prepare_query_for_language_search_and_label_retrieval(query, limit, offset)
    return [
        _process_result_from_metadata_retrieval(row)
        for row in query_ontologies_dataset(sparql)
    ]
def _process_result_from_metadata_retrieval(result) -> LabelValueItem:
    """Convert one SPARQL result row (?lang, ?label bindings) into a LabelValueItem."""
    return LabelValueItem(value=str(result.lang), label=str(result.label))
def count_results_from_languages_search(query: str) -> int:
    """Return the total number of languages matching the text search *query*.

    The COUNT aggregate query yields exactly one row with a single ?count
    binding, which is converted to a native Python int.
    """
    sparql_query = prepare_query_for_count_in_language_search(query)
    results = query_ontologies_dataset(sparql_query)
    # next(iter(...)) replaces the non-idiomatic direct __iter__() dunder call.
    return next(iter(results)).get("count").toPython()
"""
Terms from the Lexvo.org Ontology (http://lexvo.org/ontology)
"""
from rdflib import URIRef
NS = "http://lexvo.org/ontology#"
# Properties
iso639P1Code = URIRef(NS + "iso639P1Code")
......@@ -14,6 +14,7 @@ urlpatterns = [
path('v1/items/<uuid:resource_id>/suggestions', views.ItemSuggestionsView.as_view(), name="item_suggestions"),
path('v1/curation/suggest/communities', views.CurationSuggestCommunitiesView.as_view(), name="curation_suggest_communities"),
path('v1/curation/suggest/learning-resource-types', views.CurationSuggestLearningResourceTypesView.as_view(), name="curation_suggest_learning_resource_types"),
path('v1/curation/suggest/languages', views.CurationSuggestLanguagesView.as_view(), name="curation_suggest_languages"),
path('v1/curation/suggest/disciplines', views.CurationSuggestDisciplinesView.as_view(), name="curation_suggest_disciplines"),
path('v1/curation/suggest/licenses', views.CurationSuggestLicensesView.as_view(), name="curation_suggest_licenses"),
path('v1/curation/suggest/proficiency-levels', views.CurationSuggestProficiencyLevelsView.as_view(), name="curation_suggest_proficiency_levels"),
......
......@@ -8,6 +8,7 @@ from rest_framework.views import APIView
from project.dalia.api_models.api_models import ItemSearchResult
from project.dalia.curation.suggest.communities import get_communities_suggestions
from project.dalia.curation.suggest.disciplines import get_disciplines_suggestions
from project.dalia.curation.suggest.languages import get_languages_suggestions
from project.dalia.curation.suggest.learning_resource_types import get_learning_resource_types_suggestions
from project.dalia.curation.suggest.media_types import get_media_types_suggestions
from project.dalia.curation.suggest.proficiency_levels import get_proficiency_levels_suggestions
......@@ -114,6 +115,18 @@ class CurationSuggestLearningResourceTypesView(APIView):
return Response(serializer.data)
# endpoint /curation/suggest/languages
class CurationSuggestLanguagesView(APIView):
    """Suggest languages for curation via a paginated text search."""

    def get(self, request: Request):
        # Validate the search/pagination parameters from the query string.
        params = serializers.CurationSuggestSearchRequestSerializer(data=request.query_params)
        params.is_valid(raise_exception=True)
        suggestions = get_languages_suggestions(params.validated_data)
        result_serializer = serializers.CurationSuggestPaginatedResultSerializer(suggestions)
        return Response(result_serializer.data)
# endpoint /curation/suggest/disciplines
class CurationSuggestDisciplinesView(APIView):
def get(self, request: Request):
......
import pytest
from django.urls import reverse
from rest_framework import status
from rest_framework.exceptions import ErrorDetail
from project.dalia.api_models.api_models import (
CurationSuggestPaginatedResult,
CurationSuggestSearchRequest,
LabelValueItem,
)
from project.dalia.curation.suggest.languages import (
count_results_from_languages_search,
get_languages_suggestions,
prepare_query_for_count_in_language_search,
prepare_query_for_language_search_and_label_retrieval,
)
from project.dalia.serializers import CurationSuggestPaginatedResultSerializer
from tests.project.dalia.utils import dedent_and_normalize, normalize
def test_prepare_query_for_language_search_and_label_retrieval():
    """The generated search SPARQL must match the expected query verbatim."""
    sparql = prepare_query_for_language_search_and_label_retrieval(query="fr*", limit=10, offset=42)
    expected = dedent_and_normalize("""
        SELECT ?lang ?label
        WHERE {
        ( ?lang ?score ?label ) <http://jena.apache.org/text#query> ( <http://www.w3.org/2008/05/skos#prefLabel> "fr*" ) .
        FILTER ( LANG ( ?label ) = "en" ) .
        OPTIONAL {
        ?lang <http://lexvo.org/ontology#iso639P1Code> ?iso639P1Code .
        } .
        BIND ( COALESCE ( ?iso639P1Code, "false"^^<http://www.w3.org/2001/XMLSchema#boolean> ) AS ?boundIso639P1Code ) .
        }
        ORDER BY DESC ( ?score ) ?boundIso639P1Code
        LIMIT 10
        OFFSET 42
    """)
    assert normalize(sparql) == expected
def test_prepare_query_for_count_in_language_search():
    """The generated count SPARQL must match the expected query verbatim."""
    sparql = prepare_query_for_count_in_language_search(query="fr*")
    expected = dedent_and_normalize("""
        SELECT (COUNT( ?lang ) as ?count)
        WHERE {
        ( ?lang ?score ?label ) <http://jena.apache.org/text#query> ( <http://www.w3.org/2008/05/skos#prefLabel> "fr*" ) .
        FILTER ( LANG ( ?label ) = "en" ) .
        }
    """)
    assert normalize(sparql) == expected
def test_get_languages_suggestions(triplestore):
    """Paging metadata is echoed back and results arrive in ranked order."""
    request = CurationSuggestSearchRequest(q="en", limit=4, offset=13)

    result = get_languages_suggestions(request)

    assert result.count == 38
    assert result.limit == 4
    assert result.offset == 13
    assert [item.value for item in result.results] == [
        "http://lexvo.org/id/iso639-3/ptt",
        "http://lexvo.org/id/iso639-3/enu",
        "http://lexvo.org/id/iso639-3/enw",
        "http://lexvo.org/id/iso639-3/env",
    ]
    assert [item.label for item in result.results] == [
        "Enrekang",
        "Enu",
        "Enwan (Akwa Ibom State)",
        "Enwan (Edu State)",
    ]
@pytest.mark.parametrize(
    "query, expected_count",
    [
        ("*", 7771),
        ("germ*", 11),
        ("KLIN*", 1),
    ]
)
def test_count_results_from_languages_search(triplestore, query, expected_count):
    """Counting supports wildcards and is case-insensitive."""
    actual = count_results_from_languages_search(query)
    assert actual == expected_count
@pytest.mark.parametrize(
    "request_data, expected_response_data",
    [
        # default q matches everything; first page of two
        (
            {"limit": 2},
            CurationSuggestPaginatedResult(
                count=7771,
                offset=0,
                limit=2,
                results=[
                    LabelValueItem(label="Afar", value="http://lexvo.org/id/iso639-3/aar"),
                    LabelValueItem(label="Abkhazian", value="http://lexvo.org/id/iso639-3/abk"),
                ]
            )
        ),
        # single exact-ish match
        (
            {"q": "Klin"},
            CurationSuggestPaginatedResult(
                count=1,
                offset=0,
                limit=10,
                results=[
                    LabelValueItem(label="Klingon", value="http://lexvo.org/id/iso639-3/tlh"),
                ]
            )
        ),
        # prefix search with custom limit
        (
            {"q": "fr", "limit": 4},
            CurationSuggestPaginatedResult(
                count=26,
                offset=0,
                limit=4,
                results=[
                    LabelValueItem(label="French", value="http://lexvo.org/id/iso639-3/fra"),
                    LabelValueItem(label="Western Frisian", value="http://lexvo.org/id/iso639-3/fry"),
                    LabelValueItem(label="Cajun French", value="http://lexvo.org/id/iso639-3/frc"),
                    LabelValueItem(label="Eastern Frisian", value="http://lexvo.org/id/iso639-3/frs"),
                ]
            )
        ),
        # offset into the result list
        (
            {"q": "fre", "limit": 1, "offset": 10},
            CurationSuggestPaginatedResult(
                count=14,
                offset=10,
                limit=1,
                results=[
                    LabelValueItem(label="Saint Lucian Creole French", value="http://lexvo.org/id/iso639-3/acf"),
                ]
            )
        ),
        # no matches at all
        (
            {"q": "abc", "limit": 100},
            CurationSuggestPaginatedResult(
                count=0,
                offset=0,
                limit=100,
                results=[]
            )
        ),
    ]
)
def test_get_on_CurationSuggestLanguagesView_returns_200_and_valid_data(
    triplestore, api_client, request_data, expected_response_data
):
    """GET returns 200 and a payload that round-trips through the result serializer."""
    response = api_client.get(reverse("curation_suggest_languages"), data=request_data)

    assert response.status_code == status.HTTP_200_OK
    serializer = CurationSuggestPaginatedResultSerializer(data=response.data)
    assert serializer.is_valid()
    assert serializer.validated_data == expected_response_data
@pytest.mark.parametrize(
    "request_data, expected_error_response",
    [
        # limit below the minimum of 1
        (
            {"limit": 0},
            {
                'limit': [ErrorDetail(string='Ensure this value is greater than or equal to 1.', code='min_value')],
            },
        ),
        # negative offset
        (
            {"offset": -1},
            {
                'offset': [ErrorDetail(string='Ensure this value is greater than or equal to 0.', code='min_value')],
            },
        ),
        # both parameters invalid at once
        (
            {"limit": 0, "offset": -1},
            {
                'limit': [ErrorDetail(string='Ensure this value is greater than or equal to 1.', code='min_value')],
                'offset': [ErrorDetail(string='Ensure this value is greater than or equal to 0.', code='min_value')],
            },
        ),
    ]
)
def test_get_on_CurationSuggestLanguagesView_returns_400_for_invalid_request_data(
    api_client, request_data, expected_error_response
):
    """Invalid pagination parameters yield 400 with per-field error details."""
    response = api_client.get(reverse("curation_suggest_languages"), data=request_data)

    assert response.status_code == status.HTTP_400_BAD_REQUEST
    assert response.data == expected_error_response
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment