# Patch overview (commentary only; everything from the first "diff --git" line on is the patch payload).
#
# Purpose: add a language suggestion endpoint, GET v1/curation/suggest/languages.
#
# project/dalia/curation/suggest/languages.py (new file)
#   * get_languages_suggestions(request): appends "*" to request.q, then returns a
#     CurationSuggestPaginatedResult built from two SPARQL queries: one for the total count and one
#     for the requested page (limit/offset are echoed back unchanged).
#   * _where_for_text_search(...): shared WHERE patterns. It builds the property-function pattern
#     (?lang ?score ?label) text:query (skos:prefLabel "<query>") and filters on LANG(?label) = "en".
#   * prepare_query_for_language_search_and_label_retrieval(...): SELECT ?lang ?label with an OPTIONAL
#     lvont:iso639P1Code lookup, ordered by descending ?score and then by ?boundIso639P1Code
#     (COALESCE of the code and boolean false; the inline comment explains the ordering workaround),
#     followed by LIMIT and OFFSET.
#   * prepare_query_for_count_in_language_search(...): SELECT (COUNT(?lang) as ?count) over the same
#     WHERE patterns.
#   * _search_languages_and_retrieve_labels(...): runs the page query through query_ontologies_dataset
#     and maps each row to LabelValueItem(value=str(row.lang), label=str(row.label)).
#   * count_results_from_languages_search(query): runs the count query and converts the "count"
#     binding of the first row with toPython().
#
# project/dalia/rdf/namespace/lvont.py (new file)
#   * Defines NS = "http://lexvo.org/ontology#" and the property URIRef iso639P1Code.
#
# project/dalia/urls.py, project/dalia/views.py
#   * Registers the route under the name "curation_suggest_languages" and adds
#     CurationSuggestLanguagesView. Its get() validates request.query_params with
#     CurationSuggestSearchRequestSerializer (raise_exception=True), passes validated_data to
#     get_languages_suggestions and serializes the result with CurationSuggestPaginatedResultSerializer.
#
# tests/project/dalia/curation/suggest/test_languages.py (new file)
#   * Compares the two generated SPARQL strings after normalization, checks counts and result pages
#     using the `triplestore` fixture, and checks the view's 200 and 400 responses via `api_client`.
#
# NOTE(review): request.q is concatenated with "*" and used verbatim as the text-index query string.
#   Nothing in this patch escapes query-syntax characters in user input; confirm whether that is
#   handled elsewhere or is intended.
# NOTE(review): count_results_from_languages_search assumes the count query always yields a row, and
#   `next(results.__iter__())` could be spelled `next(iter(results))`.
# NOTE(review): line breaks and indentation below were reconstructed from a whitespace-collapsed copy.
#   Line counts agree with every hunk header, but exact indentation should be confirmed against the
#   repository.
diff --git a/project/dalia/curation/suggest/languages.py b/project/dalia/curation/suggest/languages.py
new file mode 100644
index 0000000000000000000000000000000000000000..935b62c1b8a78e1e60dcf167c334eeda774e01a9
--- /dev/null
+++ b/project/dalia/curation/suggest/languages.py
@@ -0,0 +1,115 @@
+from typing import List
+
+from rdflib import BNode, Graph, Literal, Variable
+from rdflib.collection import Collection
+
+from project.dalia.api_models.api_models import (
+    CurationSuggestPaginatedResult,
+    CurationSuggestSearchRequest,
+    LabelValueItem,
+)
+from project.dalia.query.utils import query_ontologies_dataset
+from project.dalia.query_builder.query_builder import (
+    Aggregates,
+    BIND,
+    FILTER,
+    FunctionExpressions,
+    OPTIONAL,
+    Operators,
+    QueryBuilder,
+)
+from project.dalia.rdf.namespace import Jena_text, SKOS_last_call, lvont
+
+
+# data for endpoint /curation/suggest/languages
+def get_languages_suggestions(request: CurationSuggestSearchRequest) -> CurationSuggestPaginatedResult:
+    query = request.q + "*"
+    limit = request.limit
+    offset = request.offset
+
+    return CurationSuggestPaginatedResult(
+        count=count_results_from_languages_search(query),
+        offset=offset,
+        limit=limit,
+        results=_search_languages_and_retrieve_labels(query, limit, offset)
+    )
+
+
+_VARIABLES = {
+    "lang": Variable("lang"),
+    "label": Variable("label"),
+}
+
+
+def _where_for_text_search(query: str, var_lang: Variable, var_score: Variable, var_label: Variable):
+    subject_list_for_text_search = Collection(Graph(), BNode(), [var_lang, var_score, var_label])
+    object_list_for_text_search = Collection(Graph(), BNode(), [SKOS_last_call.prefLabel, Literal(query)])
+
+    where = [
+        (subject_list_for_text_search, Jena_text.query, object_list_for_text_search),
+        FILTER(
+            Operators.EQ(
+                FunctionExpressions.LANG(var_label),
+                Literal("en")
+            )
+        ),
+    ]
+
+    return tuple(where)
+
+
+def prepare_query_for_language_search_and_label_retrieval(query: str, limit: int, offset: int) -> str:
+    var_lang = _VARIABLES["lang"]
+    var_score = Variable("score")
+    var_label = _VARIABLES["label"]
+    var_iso639P1Code = Variable("iso639P1Code")
+    var_boundIso639P1Code = Variable("boundIso639P1Code")
+
+    return QueryBuilder().SELECT(
+        *_VARIABLES.values()
+    ).WHERE(
+        *_where_for_text_search(query, var_lang, var_score, var_label),
+        OPTIONAL(
+            (var_lang, lvont.iso639P1Code, var_iso639P1Code),
+        ),
+        # In case of identical text search scores we would like to show languages with ISO639-1 code first, but
+        # ?iso639P1Code might be unbound. This causes problems, because in the ORDER BY clause unbound values come first
+        # (see https://www.w3.org/TR/sparql11-query/#modOrderBy).
+        # The workaround is to bind ?boundIso639P1Code to the boolean "false" in case ?iso639P1Code is unbound. In
+        # Apache Jena's SPARQL engine strings have a lower order than booleans in the ORDER BY clause.
+        BIND(FunctionExpressions.COALESCE(var_iso639P1Code, Literal(False)), var_boundIso639P1Code),
+    ).ORDER_BY(
+        FunctionExpressions.DESC(var_score),
+        var_boundIso639P1Code,
+    ).LIMIT(limit).OFFSET(offset).build()
+
+
+def prepare_query_for_count_in_language_search(query: str) -> str:
+    var_lang = _VARIABLES["lang"]
+    var_score = Variable("score")
+    var_label = _VARIABLES["label"]
+
+    return QueryBuilder().SELECT(
+        count=Aggregates.COUNT(var_lang),
+    ).WHERE(
+        *_where_for_text_search(query, var_lang, var_score, var_label),
+    ).build()
+
+
+def _search_languages_and_retrieve_labels(query: str, limit: int, offset: int) -> List[LabelValueItem]:
+    sparql_query = prepare_query_for_language_search_and_label_retrieval(query, limit, offset)
+    results = query_ontologies_dataset(sparql_query)
+    return [_process_result_from_metadata_retrieval(result) for result in results]
+
+
+def _process_result_from_metadata_retrieval(result) -> LabelValueItem:
+    return LabelValueItem(
+        value=str(result.lang),
+        label=str(result.label)
+    )
+
+
+def count_results_from_languages_search(query: str) -> int:
+    sparql_query = prepare_query_for_count_in_language_search(query)
+    results = query_ontologies_dataset(sparql_query)
+    return next(results.__iter__()).get("count").toPython()
diff --git a/project/dalia/rdf/namespace/lvont.py b/project/dalia/rdf/namespace/lvont.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e86f424c575bf0beb49549c39ecd25923ab5e2d
--- /dev/null
+++ b/project/dalia/rdf/namespace/lvont.py
@@ -0,0 +1,9 @@
+"""
+Terms from the Lexvo.org Ontology (http://lexvo.org/ontology)
+"""
+from rdflib import URIRef
+
+NS = "http://lexvo.org/ontology#"
+
+# Properties
+iso639P1Code = URIRef(NS + "iso639P1Code")
diff --git a/project/dalia/urls.py b/project/dalia/urls.py
index 50a25eedf877f2fcd6144a9834631f81a13f4cb7..466998bd1de2851f85855e8ddc65edee69c1d260 100644
--- a/project/dalia/urls.py
+++ b/project/dalia/urls.py
@@ -14,6 +14,7 @@ urlpatterns = [
     path('v1/items/<uuid:resource_id>/suggestions', views.ItemSuggestionsView.as_view(), name="item_suggestions"),
     path('v1/curation/suggest/communities', views.CurationSuggestCommunitiesView.as_view(), name="curation_suggest_communities"),
     path('v1/curation/suggest/learning-resource-types', views.CurationSuggestLearningResourceTypesView.as_view(), name="curation_suggest_learning_resource_types"),
+    path('v1/curation/suggest/languages', views.CurationSuggestLanguagesView.as_view(), name="curation_suggest_languages"),
     path('v1/curation/suggest/disciplines', views.CurationSuggestDisciplinesView.as_view(), name="curation_suggest_disciplines"),
     path('v1/curation/suggest/licenses', views.CurationSuggestLicensesView.as_view(), name="curation_suggest_licenses"),
     path('v1/curation/suggest/proficiency-levels', views.CurationSuggestProficiencyLevelsView.as_view(), name="curation_suggest_proficiency_levels"),
diff --git a/project/dalia/views.py b/project/dalia/views.py
index 02ef5ac7c1e0f12a3e8cf12968cc1b8f41349083..c44f3e957ddda4b7d25c88f1c900b2647b3b419b 100644
--- a/project/dalia/views.py
+++ b/project/dalia/views.py
@@ -8,6 +8,7 @@ from rest_framework.views import APIView
 from project.dalia.api_models.api_models import ItemSearchResult
 from project.dalia.curation.suggest.communities import get_communities_suggestions
 from project.dalia.curation.suggest.disciplines import get_disciplines_suggestions
+from project.dalia.curation.suggest.languages import get_languages_suggestions
 from project.dalia.curation.suggest.learning_resource_types import get_learning_resource_types_suggestions
 from project.dalia.curation.suggest.media_types import get_media_types_suggestions
 from project.dalia.curation.suggest.proficiency_levels import get_proficiency_levels_suggestions
@@ -114,6 +115,18 @@ class CurationSuggestLearningResourceTypesView(APIView):
         return Response(serializer.data)
 
 
+# endpoint /curation/suggest/languages
+class CurationSuggestLanguagesView(APIView):
+    def get(self, request: Request):
+        request_serializer = serializers.CurationSuggestSearchRequestSerializer(data=request.query_params)
+        request_serializer.is_valid(raise_exception=True)
+
+        result_serializer = serializers.CurationSuggestPaginatedResultSerializer(
+            get_languages_suggestions(request_serializer.validated_data)
+        )
+        return Response(result_serializer.data)
+
+
 # endpoint /curation/suggest/disciplines
 class CurationSuggestDisciplinesView(APIView):
     def get(self, request: Request):
diff --git a/tests/project/dalia/curation/suggest/test_languages.py b/tests/project/dalia/curation/suggest/test_languages.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d6632615c76c668952ffa583462c8158f31154a
--- /dev/null
+++ b/tests/project/dalia/curation/suggest/test_languages.py
@@ -0,0 +1,201 @@
+import pytest
+from django.urls import reverse
+from rest_framework import status
+from rest_framework.exceptions import ErrorDetail
+
+from project.dalia.api_models.api_models import (
+    CurationSuggestPaginatedResult,
+    CurationSuggestSearchRequest,
+    LabelValueItem,
+)
+from project.dalia.curation.suggest.languages import (
+    count_results_from_languages_search,
+    get_languages_suggestions,
+    prepare_query_for_count_in_language_search,
+    prepare_query_for_language_search_and_label_retrieval,
+)
+from project.dalia.serializers import CurationSuggestPaginatedResultSerializer
+from tests.project.dalia.utils import dedent_and_normalize, normalize
+
+
+def test_prepare_query_for_language_search_and_label_retrieval():
+    query = prepare_query_for_language_search_and_label_retrieval(
+        query="fr*",
+        limit=10,
+        offset=42
+    )
+
+    assert normalize(query) == dedent_and_normalize("""
+        SELECT ?lang ?label
+        WHERE {
+            ( ?lang ?score ?label ) <http://jena.apache.org/text#query> ( <http://www.w3.org/2008/05/skos#prefLabel> "fr*" ) .
+            FILTER ( LANG ( ?label ) = "en" ) .
+            OPTIONAL {
+                ?lang <http://lexvo.org/ontology#iso639P1Code> ?iso639P1Code .
+            } .
+            BIND ( COALESCE ( ?iso639P1Code, "false"^^<http://www.w3.org/2001/XMLSchema#boolean> ) AS ?boundIso639P1Code ) .
+        }
+        ORDER BY DESC ( ?score ) ?boundIso639P1Code
+        LIMIT 10
+        OFFSET 42
+    """)
+
+
+def test_prepare_query_for_count_in_language_search():
+    query = prepare_query_for_count_in_language_search(query="fr*")
+
+    assert normalize(query) == dedent_and_normalize("""
+        SELECT (COUNT( ?lang ) as ?count)
+        WHERE {
+            ( ?lang ?score ?label ) <http://jena.apache.org/text#query> ( <http://www.w3.org/2008/05/skos#prefLabel> "fr*" ) .
+            FILTER ( LANG ( ?label ) = "en" ) .
+        }
+    """)
+
+
+def test_get_languages_suggestions(triplestore):
+    request = CurationSuggestSearchRequest(q="en", limit=4, offset=13)
+
+    result = get_languages_suggestions(request)
+
+    assert result.count == 38
+    assert result.limit == 4
+    assert result.offset == 13
+
+    result_values = list(map(lambda result_item: result_item.value, result.results))
+    assert result_values == [
+        "http://lexvo.org/id/iso639-3/ptt",
+        "http://lexvo.org/id/iso639-3/enu",
+        "http://lexvo.org/id/iso639-3/enw",
+        "http://lexvo.org/id/iso639-3/env",
+    ]
+
+    result_labels = list(map(lambda result_item: result_item.label, result.results))
+    assert result_labels == [
+        "Enrekang",
+        "Enu",
+        "Enwan (Akwa Ibom State)",
+        "Enwan (Edu State)",
+    ]
+
+
+@pytest.mark.parametrize(
+    "query, expected_count",
+    [
+        ("*", 7771),
+        ("germ*", 11),
+        ("KLIN*", 1),
+    ]
+)
+def test_count_results_from_languages_search(triplestore, query, expected_count):
+    assert count_results_from_languages_search(query) == expected_count
+
+
+@pytest.mark.parametrize(
+    "request_data, expected_response_data",
+    [
+        (
+            {"limit": 2},
+            CurationSuggestPaginatedResult(
+                count=7771,
+                offset=0,
+                limit=2,
+                results=[
+                    LabelValueItem(label="Afar", value="http://lexvo.org/id/iso639-3/aar"),
+                    LabelValueItem(label="Abkhazian", value="http://lexvo.org/id/iso639-3/abk"),
+                ]
+            )
+        ),
+        (
+            {"q": "Klin"},
+            CurationSuggestPaginatedResult(
+                count=1,
+                offset=0,
+                limit=10,
+                results=[
+                    LabelValueItem(label="Klingon",value="http://lexvo.org/id/iso639-3/tlh"),
+                ]
+            )
+        ),
+        (
+            {"q": "fr", "limit": 4},
+            CurationSuggestPaginatedResult(
+                count=26,
+                offset=0,
+                limit=4,
+                results=[
+                    LabelValueItem(label="French", value="http://lexvo.org/id/iso639-3/fra"),
+                    LabelValueItem(label="Western Frisian", value="http://lexvo.org/id/iso639-3/fry"),
+                    LabelValueItem(label="Cajun French", value="http://lexvo.org/id/iso639-3/frc"),
+                    LabelValueItem(label="Eastern Frisian", value="http://lexvo.org/id/iso639-3/frs"),
+                ]
+            )
+        ),
+        (
+            {"q": "fre", "limit": 1, "offset": 10},
+            CurationSuggestPaginatedResult(
+                count=14,
+                offset=10,
+                limit=1,
+                results=[
+                    LabelValueItem(label="Saint Lucian Creole French", value="http://lexvo.org/id/iso639-3/acf"),
+                ]
+            )
+        ),
+        (
+            {"q": "abc", "limit": 100},
+            CurationSuggestPaginatedResult(
+                count=0,
+                offset=0,
+                limit=100,
+                results=[]
+            )
+        ),
+    ]
+)
+def test_get_on_CurationSuggestLanguagesView_returns_200_and_valid_data(
+        triplestore, api_client, request_data, expected_response_data
+):
+    response = api_client.get(reverse("curation_suggest_languages"), data=request_data)
+
+    assert response.status_code == status.HTTP_200_OK
+
+    serializer = CurationSuggestPaginatedResultSerializer(data=response.data)
+
+    assert serializer.is_valid()
+    data = serializer.validated_data
+
+    assert data == expected_response_data
+
+
+@pytest.mark.parametrize(
+    "request_data, expected_error_response",
+    [
+        (
+            {"limit": 0},
+            {
+                'limit': [ErrorDetail(string='Ensure this value is greater than or equal to 1.', code='min_value')],
+            },
+        ),
+        (
+            {"offset": -1},
+            {
+                'offset': [ErrorDetail(string='Ensure this value is greater than or equal to 0.', code='min_value')],
+            },
+        ),
+        (
+            {"limit": 0, "offset": -1},
+            {
+                'limit': [ErrorDetail(string='Ensure this value is greater than or equal to 1.', code='min_value')],
+                'offset': [ErrorDetail(string='Ensure this value is greater than or equal to 0.', code='min_value')],
+            },
+        ),
+    ]
+)
+def test_get_on_CurationSuggestLanguagesView_returns_400_for_invalid_request_data(
+        api_client, request_data, expected_error_response
+):
+    response = api_client.get(reverse("curation_suggest_languages"), data=request_data)
+
+    assert response.status_code == status.HTTP_400_BAD_REQUEST
+    assert response.data == expected_error_response