Skip to content
Snippets Groups Projects
Commit c40b3ff7 authored by Frank Lange
Browse files

add endpoint /suggest/licenses

parent b57e6131
Branches
Tags
No related merge requests found
Pipeline #1524009 passed
......@@ -217,6 +217,13 @@ class Event(BaseItem, EventSpecial):
# Type alias for "any item": either an Event or a Resource.
Item = Union[Event, Resource]
# Common query parameters of a curation "suggest" search request.
#   q      -- text query; defaults to "" (the serializer is told to allow blank values)
#   limit  -- maximum number of results; defaults to 10 (serializer minimum: 1)
#   offset -- number of results to skip; defaults to 0 (serializer minimum: 0)
# The validation constraints are handed to the serializer via the "serializer_kwargs" field metadata.
@dataclass
class CurationSuggestSearchRequest:
q: str = field(default="", metadata={"serializer_kwargs": {"allow_blank": True}})
limit: int = field(default=10, metadata={"serializer_kwargs": {"min_value": 1}})
offset: int = field(default=0, metadata={"serializer_kwargs": {"min_value": 0}})
@dataclass
class CurationSuggestResultItem:
label: str
......@@ -226,3 +233,22 @@ class CurationSuggestResultItem:
# Suggestion item for disciplines: a CurationSuggestResultItem that additionally holds a list of
# child items of the same type, i.e. the items can be nested.
@dataclass
class CurationSuggestDisciplinesResultItem(CurationSuggestResultItem):
children: List[CurationSuggestDisciplinesResultItem]
# Request parameters for license suggestions: the common search parameters (q, limit, offset)
# plus `filter`, which must be one of "recommended", "all" or "recent" and defaults to "all".
@dataclass
class CurationSuggestLicensesRequest(CurationSuggestSearchRequest):
filter: Literal["recommended", "all", "recent"] = "all"
# A single license suggestion. Every field is an optional string and defaults to None.
#   value              -- identifier of the license
#   licenseId          -- id of the license
#   licenseName        -- name of the license
#   licenseLink        -- link belonging to the license
#   licenseDescription -- descriptive text of the license
@dataclass
class CurationSuggestLicensesResultItem:
value: Optional[str] = None
licenseId: Optional[str] = None
licenseName: Optional[str] = None
licenseLink: Optional[str] = None
licenseDescription: Optional[str] = None
# Paginated result of a license suggestion request; `results` holds the license items.
# NOTE(review): the default value None does not match the List[...] annotation -- consider
# Optional[List[...]] or field(default_factory=list); confirm the effect on the serializer first.
@dataclass
class CurationSuggestLicensesResult(PaginatedResult):
results: List[CurationSuggestLicensesResultItem] = None
from typing import List
from rdflib import BNode, Graph, Literal, RDF, Variable
from rdflib.collection import Collection
from project.dalia.api_models.api_models import (
CurationSuggestLicensesRequest,
CurationSuggestLicensesResult,
CurationSuggestLicensesResultItem,
)
from project.dalia.query.utils import query_ontologies_dataset
from project.dalia.query_builder.query_builder import (
Aggregates,
BIND,
FILTER_EXISTS,
FunctionExpressions,
OPTIONAL,
QueryBuilder,
)
from project.dalia.rdf.namespace import Dalia_text, Jena_text, spdx
# data for endpoint /suggest/licenses
def get_licenses_suggestions(request: CurationSuggestLicensesRequest) -> CurationSuggestLicensesResult:
    """
    Return license suggestions for the given request.

    The request's ``filter`` selects the source of the suggestions:
    "recommended", "all" (text search over all licenses) or "recent".

    :param request: the validated request parameters
    :return: the paginated license suggestions
    :raises ValueError: if ``request.filter`` is none of the supported values
    """
    license_filter = request.filter

    # TODO: replace with match-case statement (Python 3.10 and above)
    if license_filter == "recommended":
        return _get_recommended_licenses(request)
    if license_filter == "all":
        return search_all_licenses(request)
    if license_filter == "recent":
        return _get_recently_used_licenses(request)

    # An unknown filter used to fall through and implicitly return None, which contradicts the declared
    # return type and only fails later in the caller. Fail loudly at the source instead.
    raise ValueError(f"Unsupported license filter: {license_filter!r}")
# SPARQL variables describing a license, keyed by their variable name (insertion order is significant
# wherever the values are unpacked into a SELECT clause).
_VARIABLES = {
    name: Variable(name)
    for name in (
        "license",
        "licenseId",
        "licenseName",
        "licenseLink",
        "licenseDescription",
    )
}
def _where_for_text_search(query: str, var_license: Variable, var_score: Variable):
    """
    Build the WHERE patterns of the license text search.

    The patterns consist of the text query itself (binding the license and its score), a type restriction to
    ``spdx.ListedLicense`` and a negated EXISTS filter on ``spdx.isDeprecatedLicenseId`` being true.

    :param query: text query handed to the text index
    :param var_license: variable receiving the matched license
    :param var_score: variable receiving the score of the match
    :return: tuple of patterns to be unpacked into a WHERE clause
    """
    search_subject = Collection(Graph(), BNode(), [var_license, var_score])
    search_object = Collection(Graph(), BNode(), [Dalia_text.spdxLicensesTexts, Literal(query)])

    return (
        (search_subject, Jena_text.query, search_object),
        (var_license, RDF.type, spdx.ListedLicense),
        FILTER_EXISTS(
            (var_license, spdx.isDeprecatedLicenseId, Literal(True)),
            state=False,
        ),
    )
def prepare_query_for_license_search_and_metadata_retrieval(query: str, limit: int, offset: int) -> str:
    """
    Build the SPARQL query that searches licenses and retrieves their metadata in one go.

    A sub-select picks the distinct licenses matching the text query, ordered by descending score and cut to the
    requested page. The outer query optionally attaches id, name, a link (taken from the license's cross
    references, ordered by ascending ``order`` and limited to one) and the license text.

    :param query: text query handed to the text search
    :param limit: LIMIT of the license sub-select
    :param offset: OFFSET of the license sub-select
    :return: the built query
    """
    var_license = _VARIABLES["license"]
    var_score = Variable("score")
    var_crossref_url = Variable("licenseLinkFromCrosslink")
    var_crossref = Variable("crossref")
    var_order = Variable("order")

    matching_licenses = (
        QueryBuilder()
        .SELECT(var_license, distinct=True)
        .WHERE(*_where_for_text_search(query, var_license, var_score))
        .ORDER_BY(FunctionExpressions.DESC(var_score))
        .LIMIT(limit)
        .OFFSET(offset)
        .build()
    )

    first_crossref_url = (
        QueryBuilder()
        .SELECT(var_license, var_crossref_url)
        .WHERE(
            (var_license, spdx.crossRef_P, var_crossref),
            (var_crossref, RDF.type, spdx.CrossRef_T),
            (var_crossref, spdx.order, var_order),
            (var_crossref, spdx.url, var_crossref_url),
        )
        .ORDER_BY(FunctionExpressions.ASC(var_order))
        .LIMIT(1)
        .build()
    )

    license_link = OPTIONAL(
        first_crossref_url,
        # Workaround for Apache Jena's SPARQL engine: the BIND is required for ?licenseLink to be bound outside
        # of the OPTIONAL. Without the LIMIT clause in the sub-select ?licenseLink seems to become bound even
        # without the BIND (possibly a bug). Other SPARQL engines don't seem to need this workaround.
        BIND(var_crossref_url, _VARIABLES["licenseLink"]),
    )

    return (
        QueryBuilder()
        .SELECT(*_VARIABLES.values())
        .WHERE(
            matching_licenses,
            OPTIONAL((var_license, spdx.licenseId, _VARIABLES["licenseId"])),
            OPTIONAL((var_license, spdx.name, _VARIABLES["licenseName"])),
            license_link,
            OPTIONAL((var_license, spdx.licenseText, _VARIABLES["licenseDescription"])),
        )
        .build()
    )
def prepare_query_for_count_in_license_search(query: str) -> str:
    """
    Build the SPARQL query counting the distinct licenses that match the text query.

    The count is selected under the name ``count``.

    :param query: text query handed to the text search
    :return: the built query
    """
    var_license = _VARIABLES["license"]
    var_score = Variable("score")

    builder = QueryBuilder()
    builder = builder.SELECT(count=Aggregates("COUNT", var_license, ["DISTINCT"]))
    builder = builder.WHERE(*_where_for_text_search(query, var_license, var_score))
    return builder.build()
def search_all_licenses(request: CurationSuggestLicensesRequest) -> CurationSuggestLicensesResult:
    """
    Run a text search over all licenses.

    An empty ``request.q`` is replaced by the query "*".

    :param request: request providing the text query, limit and offset
    :return: the total number of matches plus the requested page of license items
    """
    # The user's query is passed on as is. Blindly appending "*" to every query may completely break the Lucene
    # search: "CC-BY", for instance, would become "CC-BY*", which returns no results for whatever reason.
    # Lucene's StandardAnalyzer treats hyphens as whitespace (see https://stackoverflow.com/a/10187140), so
    # "CC-BY" becomes "CC BY" (or more precisely "CC OR BY"). Such a text query returns many results (maybe too
    # many?), but the scoring is reasonable enough for an acceptable user experience.
    query = request.q if request.q else "*"
    page_size = request.limit
    skipped = request.offset

    total = count_results_from_license_search(query)
    page = _search_licenses_and_retrieve_metadata(query, page_size, skipped)

    return CurationSuggestLicensesResult(count=total, offset=skipped, limit=page_size, results=page)
def _search_licenses_and_retrieve_metadata(
    query: str, limit: int, offset: int
) -> List[CurationSuggestLicensesResultItem]:
    """
    Query the ontologies dataset for licenses matching the text query.

    :return: one result item per row returned by the query
    """
    rows = query_ontologies_dataset(
        prepare_query_for_license_search_and_metadata_retrieval(query, limit, offset)
    )
    return list(map(_process_result_from_metadata_retrieval, rows))
def _process_result_from_metadata_retrieval(result) -> CurationSuggestLicensesResultItem:
    """
    Convert one row of the metadata query into a result item.

    ``value`` is always the string form of the license; every other field falls back to "" when the
    corresponding value of the row is falsy (e.g. unbound).
    """
    def _text_or_empty(node) -> str:
        return str(node) if node else ""

    return CurationSuggestLicensesResultItem(
        value=str(result.license),
        licenseId=_text_or_empty(result.licenseId),
        licenseName=_text_or_empty(result.licenseName),
        licenseLink=_text_or_empty(result.licenseLink),
        licenseDescription=_text_or_empty(result.licenseDescription),
    )
def count_results_from_license_search(query: str) -> int:
    """
    Return the number of licenses matching the text query.

    The value is read from the ``count`` binding of the first row returned by the count query.
    """
    rows = query_ontologies_dataset(prepare_query_for_count_in_license_search(query))
    first_row = next(iter(rows))
    return first_row.get("count").toPython()
def _get_recommended_licenses(request: CurationSuggestLicensesRequest) -> CurationSuggestLicensesResult:
    """Return an empty result (count 0, no items) that echoes the request's offset and limit."""
    offset, limit = request.offset, request.limit
    return CurationSuggestLicensesResult(count=0, offset=offset, limit=limit, results=[])
def _get_recently_used_licenses(request: CurationSuggestLicensesRequest) -> CurationSuggestLicensesResult:
    """Return an empty result (count 0, no items) that echoes the request's offset and limit."""
    offset, limit = request.offset, request.limit
    return CurationSuggestLicensesResult(count=0, offset=offset, limit=limit, results=[])
......@@ -7,3 +7,4 @@ NS = "http://dalia.education/text#"
# Properties
# Each constant is the URI of a property, built by appending the property's local name to NS.
learningResourceTexts = URIRef(NS + "learningResourceTexts")
spdxLicensesTexts = URIRef(NS + "spdxLicensesTexts")
......@@ -7,7 +7,9 @@ NS = "http://spdx.org/rdf/terms#"
# Properties
# Each constant is the URI of a property, built by appending the property's local name to NS.
# The "_P" suffix of crossRef_P is part of the Python name only; the URI ends in "crossRef".
crossRef_P = URIRef(NS + "crossRef")
isDeprecatedLicenseId = URIRef(NS + "isDeprecatedLicenseId")
licenseId = URIRef(NS + "licenseId")
licenseText = URIRef(NS + "licenseText")
name = URIRef(NS + "name")
order = URIRef(NS + "order")
url = URIRef(NS + "url")
......
......@@ -4,6 +4,8 @@ from project.dalia.api_models.api_models import (
BasicSearchFilter,
Community,
CurationSuggestDisciplinesResultItem,
CurationSuggestLicensesRequest,
CurationSuggestLicensesResult,
ItemSearchRequest,
ItemSearchResult,
Resource,
......@@ -38,3 +40,13 @@ class ItemSearchRequestSerializer(DataclassSerializer):
# DataclassSerializer bound to the CurationSuggestDisciplinesResultItem dataclass.
class CurationSuggestDisciplinesResultItemSerializer(DataclassSerializer):
class Meta:
dataclass = CurationSuggestDisciplinesResultItem
# DataclassSerializer bound to the CurationSuggestLicensesRequest dataclass
# (used to validate the request parameters of the license suggestions).
class CurationSuggestLicensesRequestSerializer(DataclassSerializer):
class Meta:
dataclass = CurationSuggestLicensesRequest
# DataclassSerializer bound to the CurationSuggestLicensesResult dataclass
# (used to serialize the result of the license suggestions).
class CurationSuggestLicensesResultSerializer(DataclassSerializer):
class Meta:
dataclass = CurationSuggestLicensesResult
......@@ -13,4 +13,5 @@ urlpatterns = [
path('v1/items', views.ItemSearchView.as_view(), name="dalia_item_search"),
path('v1/items/<uuid:resource_id>/suggestions', views.ItemSuggestionsView.as_view(), name="item_suggestions"),
path('v1/suggest/disciplines', views.CurationSuggestDisciplinesView.as_view(), name="curation_suggest_disciplines"),
path('v1/suggest/licenses', views.CurationSuggestLicensesView.as_view(), name="curation_suggest_licenses"),
]
......@@ -7,6 +7,7 @@ from rest_framework.views import APIView
from project.dalia.api_models.api_models import ItemSearchResult
from project.dalia.curation.suggest.disciplines import get_disciplines_suggestions
from project.dalia.curation.suggest.licenses import get_licenses_suggestions
from project.dalia.query.communities.communities import get_metadata_for_community
from project.dalia.query.communities.community_items import get_items_for_community
from project.dalia.query.items.basic_search_filters.basic_search_filters import get_basic_search_filters
......@@ -16,6 +17,8 @@ from project.dalia.serializers import (
BasicSearchFilterSerializer,
CommunitySerializer,
CurationSuggestDisciplinesResultItemSerializer,
CurationSuggestLicensesRequestSerializer,
CurationSuggestLicensesResultSerializer,
ItemSearchRequestSerializer,
ItemSearchResultSerializer,
ItemSerializer,
......@@ -80,7 +83,7 @@ class ItemSearchView(APIView):
# endpoint /items/{itemId}/suggestions
class ItemSuggestionsView(APIView):
def get(self, request: Request, resource_id: UUID) -> HttpResponse:
data = request.GET
data = request.query_params
search_result = ItemSearchResult()
search_result.count = 0
......@@ -98,3 +101,15 @@ class CurationSuggestDisciplinesView(APIView):
def get(self, request: Request):
    # Respond with all discipline suggestions, serialized as a list (many=True).
    suggestions = get_disciplines_suggestions()
    serialized = CurationSuggestDisciplinesResultItemSerializer(suggestions, many=True)
    return Response(serialized.data)
# endpoint /suggest/licenses
class CurationSuggestLicensesView(APIView):
    def get(self, request: Request):
        # Validate the query parameters first; invalid input raises and never reaches the search.
        params = CurationSuggestLicensesRequestSerializer(data=request.query_params)
        params.is_valid(raise_exception=True)

        # Compute the suggestions from the validated request and serialize them for the response.
        suggestions = get_licenses_suggestions(params.validated_data)
        return Response(CurationSuggestLicensesResultSerializer(suggestions).data)
import pytest
from django.urls import reverse
from rest_framework import status
from rest_framework.exceptions import ErrorDetail
from project.dalia.api_models.api_models import (
CurationSuggestLicensesRequest,
CurationSuggestLicensesResult,
CurationSuggestLicensesResultItem,
)
from project.dalia.curation.suggest.licenses import (
count_results_from_license_search,
prepare_query_for_count_in_license_search,
prepare_query_for_license_search_and_metadata_retrieval,
search_all_licenses,
)
from project.dalia.serializers import CurationSuggestLicensesResultSerializer
from tests.project.dalia.utils import dedent_and_normalize, normalize
# Pins the SPARQL text generated for the combined license search and metadata retrieval
# (query "abc", limit 10, offset 20). Generated and expected text are both passed through the
# normalize helpers before they are compared.
def test_prepare_query_for_license_search_and_metadata_retrieval():
query = prepare_query_for_license_search_and_metadata_retrieval(
query="abc",
limit=10,
offset=20
)
assert normalize(query) == dedent_and_normalize("""
SELECT ?license ?licenseId ?licenseName ?licenseLink ?licenseDescription
WHERE {
{
SELECT DISTINCT ?license
WHERE {
( ?license ?score ) <http://jena.apache.org/text#query> ( <http://dalia.education/text#spdxLicensesTexts> "abc" ) .
?license <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://spdx.org/rdf/terms#ListedLicense> .
FILTER NOT EXISTS {
?license <http://spdx.org/rdf/terms#isDeprecatedLicenseId> "true"^^<http://www.w3.org/2001/XMLSchema#boolean> .
} .
}
ORDER BY DESC ( ?score )
LIMIT 10
OFFSET 20
}
OPTIONAL {
?license <http://spdx.org/rdf/terms#licenseId> ?licenseId .
} .
OPTIONAL {
?license <http://spdx.org/rdf/terms#name> ?licenseName .
} .
OPTIONAL {
{
SELECT ?license ?licenseLinkFromCrosslink
WHERE {
?license <http://spdx.org/rdf/terms#crossRef> ?crossref .
?crossref <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://spdx.org/rdf/terms#CrossRef> .
?crossref <http://spdx.org/rdf/terms#order> ?order .
?crossref <http://spdx.org/rdf/terms#url> ?licenseLinkFromCrosslink .
}
ORDER BY ASC ( ?order )
LIMIT 1
}
BIND ( ?licenseLinkFromCrosslink AS ?licenseLink ) .
} .
OPTIONAL {
?license <http://spdx.org/rdf/terms#licenseText> ?licenseDescription .
} .
}
""")
# Pins the SPARQL text generated for counting the licenses matching the query "abc".
# Generated and expected text are both passed through the normalize helpers before they are compared.
def test_prepare_query_for_count_in_license_search():
query = prepare_query_for_count_in_license_search(query="abc")
assert normalize(query) == dedent_and_normalize("""
SELECT (COUNT( DISTINCT ?license ) as ?count)
WHERE {
( ?license ?score ) <http://jena.apache.org/text#query> ( <http://dalia.education/text#spdxLicensesTexts> "abc" ) .
?license <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://spdx.org/rdf/terms#ListedLicense> .
FILTER NOT EXISTS {
?license <http://spdx.org/rdf/terms#isDeprecatedLicenseId> "true"^^<http://www.w3.org/2001/XMLSchema#boolean> .
} .
}
""")
def test_search_all_licenses(triplestore):
    """Search for "CC-BY" (limit 5, offset 50) and check pagination data plus every field of the returned items."""
    result = search_all_licenses(CurationSuggestLicensesRequest(q="CC-BY", limit=5, offset=50))

    assert result.count == 481
    assert result.limit == 5
    assert result.offset == 50

    assert [item.value for item in result.results] == [
        "http://spdx.org/licenses/CC-BY-SA-3.0-DE",
        "http://spdx.org/licenses/CC-BY-SA-3.0-IGO",
        "http://spdx.org/licenses/EUPL-1.2",
        "http://spdx.org/licenses/Community-Spec-1.0",
        "http://spdx.org/licenses/CC-BY-NC-ND-3.0-DE",
    ]

    assert [item.licenseId for item in result.results] == [
        "CC-BY-SA-3.0-DE",
        "CC-BY-SA-3.0-IGO",
        "EUPL-1.2",
        "Community-Spec-1.0",
        "CC-BY-NC-ND-3.0-DE",
    ]

    assert [item.licenseName for item in result.results] == [
        "Creative Commons Attribution Share Alike 3.0 Germany",
        "Creative Commons Attribution-ShareAlike 3.0 IGO",
        "European Union Public License 1.2",
        "Community Specification License 1.0",
        "Creative Commons Attribution Non Commercial No Derivatives 3.0 Germany",
    ]

    assert [item.licenseLink for item in result.results] == [
        "https://creativecommons.org/licenses/by-sa/3.0/de/legalcode",
        "https://creativecommons.org/licenses/by-sa/3.0/igo/legalcode",
        "https://joinup.ec.europa.eu/page/eupl-text-11-12",
        "https://github.com/CommunitySpecification/1.0/blob/master/1._Community_Specification_License-v1.md",
        "https://creativecommons.org/licenses/by-nc-nd/3.0/de/legalcode",
    ]

    # The license texts are long, so only their beginnings are compared.
    expected_description_starts = [
        "Creative Commons Namensnennung - Weitergabe unter gleichen Bedingungen 3.0 Deutschland\n\n CREATIVE COMMONS",
        "Creative Commons Attribution-ShareAlike 3.0 IGO\n\nCREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES",
        "EUROPEAN UNION PUBLIC LICENCE v. 1.2\nEUPL © the European Union 2007, 2016\n\nThis European Union Public",
        "Community Specification License 1.0\n\nThe Purpose of this License. This License sets forth the terms under",
        "Creative Commons Namensnennung - Keine kommerzielle Nutzung - Keine Bearbeitungen 3.0 Deutschland\n\n",
    ]
    descriptions = [item.licenseDescription for item in result.results]
    for description, expected_start in zip(descriptions, expected_description_starts):
        assert description.startswith(expected_start)
@pytest.mark.parametrize(
    "query, expected_count",
    [("*", 606), ("CC", 62), ("CC-BY-SA", 486)],
)
def test_count_results_from_license_search(triplestore, query, expected_count):
    """Check the number of matching licenses reported for several text queries."""
    actual_count = count_results_from_license_search(query)
    assert actual_count == expected_count
@pytest.mark.parametrize(
    "request_data, expected_response_data",
    [
        (
            {"filter": "recent"},
            CurationSuggestLicensesResult(count=0, offset=0, limit=10, results=[]),
        ),
        (
            {"filter": "all", "q": "MIT", "limit": 1, "offset": 15},
            CurationSuggestLicensesResult(
                count=34,
                offset=15,
                limit=1,
                results=[
                    CurationSuggestLicensesResultItem(
                        value='http://spdx.org/licenses/MIT',
                        licenseId='MIT',
                        licenseName='MIT License',
                        licenseLink='https://opensource.org/license/mit/',
                        licenseDescription='MIT License\n\nCopyright (c) <year> <copyright holders>\n\nPermission '
                                           'is hereby granted, free of charge, to any person obtaining a copy of '
                                           'this software and associated documentation files (the "Software"), '
                                           'to deal in the Software without restriction, including without '
                                           'limitation the rights to use, copy, modify, merge, publish, '
                                           'distribute, sublicense, and/or sell copies of the Software, and to '
                                           'permit persons to whom the Software is furnished to do so, subject to '
                                           'the following conditions:\n\nThe above copyright notice and this '
                                           'permission notice shall be included in all copies or substantial '
                                           'portions of the Software.\n\nTHE SOFTWARE IS PROVIDED "AS IS", WITHOUT '
                                           'WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO '
                                           'THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE '
                                           'AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT '
                                           'HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER '
                                           'IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR '
                                           'IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE '
                                           'SOFTWARE.'
                    ),
                ],
            ),
        ),
        (
            {"filter": "all", "q": "abc", "limit": 100},
            CurationSuggestLicensesResult(count=0, offset=0, limit=100, results=[]),
        ),
        (
            {"filter": "recommended"},
            CurationSuggestLicensesResult(count=0, offset=0, limit=10, results=[]),
        ),
    ]
)
def test_get_on_CurationSuggestLicensesView_returns_200_and_valid_data(
    triplestore, api_client, request_data, expected_response_data
):
    """GET the license suggestions endpoint with valid parameters and compare the parsed response body."""
    url = reverse("curation_suggest_licenses")
    response = api_client.get(url, data=request_data)
    assert response.status_code == status.HTTP_200_OK

    # Parse the response back into the result dataclass so it can be compared as a whole.
    parsed = CurationSuggestLicensesResultSerializer(data=response.data)
    assert parsed.is_valid()
    assert parsed.validated_data == expected_response_data
@pytest.mark.parametrize(
    "request_data, expected_error_response",
    [
        (
            {"filter": "invalid filter"},
            {'filter': [ErrorDetail(string='"invalid filter" is not a valid choice.', code='invalid_choice')]},
        ),
        (
            {"limit": 0},
            {'limit': [ErrorDetail(string='Ensure this value is greater than or equal to 1.', code='min_value')]},
        ),
        (
            {"offset": -1},
            {'offset': [ErrorDetail(string='Ensure this value is greater than or equal to 0.', code='min_value')]},
        ),
        (
            {"filter": "invalid filter", "limit": 0, "offset": -1},
            {
                'limit': [
                    ErrorDetail(string='Ensure this value is greater than or equal to 1.', code='min_value')
                ],
                'offset': [
                    ErrorDetail(string='Ensure this value is greater than or equal to 0.', code='min_value')
                ],
                'filter': [
                    ErrorDetail(string='"invalid filter" is not a valid choice.', code='invalid_choice')
                ],
            },
        ),
    ]
)
def test_get_on_CurationSuggestLicensesView_returns_400_for_invalid_request_data(
    api_client, request_data, expected_error_response
):
    """GET the license suggestions endpoint with invalid parameters and check the reported validation errors."""
    url = reverse("curation_suggest_licenses")
    response = api_client.get(url, data=request_data)

    assert response.status_code == status.HTTP_400_BAD_REQUEST
    assert response.data == expected_error_response
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment