Skip to content
Snippets Groups Projects

Fix: Optimize Queries

Merged Benedikt Heinrichs requested to merge Issue/1792-newMetadataStructure into dev
Files
5
@@ -6,12 +6,12 @@ using Coscine.ResourceTypes.Base;
using Coscine.ResourceTypes.Base.Models;
using Org.OpenAPITools.Api;
using Org.OpenAPITools.Model;
using VDS.RDF.Query;
using VDS.RDF;
using MetadataExtractorCron.Util;
using VDS.RDF.Parsing;
using System.Globalization;
using System.Security.Cryptography;
using Coscine.Metadata.Util;
namespace MetadataExtractorCron.Extractors;
@@ -25,14 +25,6 @@ public class CoscineMetadataExtractor : IMetadataExtractor
private readonly RdfStoreConnector _rdfStoreConnector;
private readonly MetadataGraphsCreator _metadataGraphsCreator;
private const string metadataExtractionVersionUrl = "https://purl.org/coscine/terms/metatadataextraction#version";
private const string dcatdistributionUrl = "http://www.w3.org/ns/dcat#distribution";
private const string partOfUri = "http://purl.org/dc/terms/isPartOf";
private const string aUri = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";
private const string dctermsModifiedUri = "http://purl.org/dc/terms/modified";
private const string rdfSourceUri = "http://www.w3.org/ns/ldp#RDFSource";
private const string trellisGraphUri = "http://www.trellisldp.org/ns/trellis#PreferServerManaged";
public CoscineMetadataExtractor()
{
_configuration = new ConsulConfiguration();
@@ -72,7 +64,7 @@ public class CoscineMetadataExtractor : IMetadataExtractor
foreach (var file in fileInfos.Where((fileInfo) => fileInfo.HasBody))
{
if (file.BodyBytes > 16 * 1000 * 1000)
if (file.BodyBytes > VersionUtil.DetectionByteLimit)
{
Console.WriteLine($"Skipping {file.Key} on {resourceId} since it has a too large byte size");
continue;
@@ -114,63 +106,31 @@ public class CoscineMetadataExtractor : IMetadataExtractor
private void CreateMetadataSetsIfDontExist(string resourceId, ResourceEntry entry, IEnumerable<ResourceEntry> fileInfos)
{
var resourceGraphName = $"{_resourceUrlPrefix}/{resourceId}";
var newFileGraphName = $"{resourceGraphName}/{entry.Key}";
if (!newFileGraphName.EndsWith("/"))
{
newFileGraphName += "/";
}
var existingGraphs = ListGraphs(newFileGraphName);
var existingGraphs = _rdfStoreConnector.GetMetadataIds(resourceId, entry.Key);
if (!existingGraphs.Any())
{
Console.WriteLine($"Creating graphs for {newFileGraphName} since they did not exist before!");
_metadataGraphsCreator.CreateGraphs(resourceId, entry, fileInfos);
}
}
private IEnumerable<Uri> ListGraphs(string id)
{
var cmdString = new SparqlParameterizedString
{
CommandText = @"SELECT DISTINCT ?g
WHERE { GRAPH ?g { ?s ?p ?o }
FILTER(contains(str(?g), @graph)) }"
};
cmdString.SetLiteral("graph", id);
var resultSet = _rdfStoreConnector.QueryEndpoint.QueryWithResultSet(cmdString.ToString());
var graphs = new List<Uri>();
foreach (SparqlResult r in resultSet)
{
var uriNode = r.Value("g") as UriNode;
if (uriNode is not null)
{
graphs.Add(uriNode.Uri);
}
Console.WriteLine($"Creating graphs for {resourceId}, {entry.Key} since they did not exist before!");
GraphStorer.StoreGraphs(_metadataGraphsCreator.CreateGraphs(
resourceId,
entry.Key,
true,
true
), _rdfStoreConnector);
}
return graphs;
}
private bool HasCurrentMetadataExtracted(string resourceId, ResourceEntry entry)
{
var resourceGraphName = $"{_resourceUrlPrefix}/{resourceId}";
var newFileGraphName = $"{resourceGraphName}/{entry.Key}";
if (!newFileGraphName.EndsWith("/"))
{
newFileGraphName += "/";
}
var existingGraphs = ListGraphs(newFileGraphName);
var existingGraphs = _rdfStoreConnector.GetDataIds(resourceId, entry.Key);
var existingExtractedGraphs = _rdfStoreConnector.GetDataIds(resourceId, entry.Key, true);
var recentDataVersion = VersionUtil.GetRecentDataVersion(existingGraphs);
var recentDataExtractedVersion = VersionUtil.GetRecentDataExtractedVersion(existingGraphs);
var recentDataExtractedVersion = VersionUtil.GetRecentDataExtractedVersion(existingExtractedGraphs);
return
recentDataExtractedVersion != null
&& recentDataVersion != null
&& recentDataExtractedVersion.AbsoluteUri.Contains(recentDataVersion.AbsoluteUri)
&& recentDataExtractedVersion.AbsoluteUri != recentDataVersion.AbsoluteUri;
&& recentDataExtractedVersion.Contains(recentDataVersion)
&& recentDataExtractedVersion != recentDataVersion;
}
private async Task<MetadataOutput> ExtractMetadata(string resourceId, ResourceEntry entry, BaseResourceType resourceTypeDefinition, Dictionary<string, string>? resourceTypeOptions)
@@ -195,9 +155,9 @@ public class CoscineMetadataExtractor : IMetadataExtractor
var extractedOutputs = await _apiClient.PostMetadataExtractorWorkerAsync(
givenStream,
$"{resourceId}/{entry.Key.Replace("\\", "/")}",
null,
entry.Created?.ToString("o", CultureInfo.InvariantCulture),
entry.Modified?.ToString("o", CultureInfo.InvariantCulture)
null!,
entry.Created?.ToString("o", CultureInfo.InvariantCulture)!,
entry.Modified?.ToString("o", CultureInfo.InvariantCulture)!
);
return extractedOutputs[0];
@@ -221,9 +181,8 @@ public class CoscineMetadataExtractor : IMetadataExtractor
newFileGraphNameAddon += "/";
}
var existingGraphs = ListGraphs(newFileGraphNameAddon);
var recentDataVersion = VersionUtil.GetRecentDataVersion(existingGraphs);
var recentMetadataVersion = VersionUtil.GetRecentMetadataVersion(existingGraphs);
var recentDataVersion = _rdfStoreConnector.GetDataId(resourceId, entry.Key);
var recentMetadataVersion = _rdfStoreConnector.GetMetadataId(resourceId, entry.Key);
await CreateHashData(resourceId, entry, resourceTypeDefinition, resourceTypeOptions, newFileGraphNameAddon, recentDataVersion);
@@ -232,14 +191,14 @@ public class CoscineMetadataExtractor : IMetadataExtractor
throw new NullReferenceException("The recent data version is null and can't be used.");
}
var recentDataExtractedVersion = new Uri(recentDataVersion.AbsoluteUri + "&extracted=true");
var recentDataExtractedVersion = new Uri(recentDataVersion + "&extracted=true");
if (recentMetadataVersion is null)
{
throw new NullReferenceException("The recent metadata version is null and can't be used.");
}
var recentMetadataExtractedVersion = new Uri(recentMetadataVersion.AbsoluteUri + "&extracted=true");
var recentMetadataExtractedVersion = new Uri(recentMetadataVersion + "&extracted=true");
var tripleStore = new TripleStore();
tripleStore.LoadFromString(extractedMetadata.Metadata, new TriGParser(TriGSyntax.Recommendation));
@@ -248,57 +207,22 @@ public class CoscineMetadataExtractor : IMetadataExtractor
GraphStorer.StoreGraphs(tripleStore.Graphs, _rdfStoreConnector);
var trellisGraph = _rdfStoreConnector.GetGraph(trellisGraphUri);
var triples = new List<Triple>();
AddToTrellis(trellisGraph, rdfSourceUri, newFileGraphName, recentDataExtractedVersion.AbsoluteUri, triples);
AddToTrellis(trellisGraph, rdfSourceUri, newFileGraphName, recentMetadataExtractedVersion.AbsoluteUri, triples);
GraphStorer.AddToGraph(trellisGraph, triples, _rdfStoreConnector);
var newDataFileGraphName = $"{newFileGraphName}/@type=data";
var newMetadataFileGraphName = $"{newFileGraphName}/@type=metadata";
var dataGraph = CreateOrGetGraph(newDataFileGraphName);
var metadataGraph = CreateOrGetGraph(newMetadataFileGraphName);
dataGraph.Assert(new Triple(
dataGraph.CreateUriNode(new Uri(newDataFileGraphName)),
dataGraph.CreateUriNode(new Uri(dcatdistributionUrl)),
dataGraph.CreateUriNode(recentDataExtractedVersion)
));
dataGraph.Assert(new Triple(
dataGraph.CreateUriNode(recentDataExtractedVersion),
dataGraph.CreateUriNode(new Uri(metadataExtractionVersionUrl)),
dataGraph.CreateLiteralNode(metadataExtractorVersion)
));
metadataGraph.Assert(new Triple(
metadataGraph.CreateUriNode(new Uri(newMetadataFileGraphName)),
metadataGraph.CreateUriNode(new Uri(dcatdistributionUrl)),
metadataGraph.CreateUriNode(recentMetadataExtractedVersion)
));
metadataGraph.Assert(new Triple(
metadataGraph.CreateUriNode(recentMetadataExtractedVersion),
metadataGraph.CreateUriNode(new Uri(metadataExtractionVersionUrl)),
metadataGraph.CreateLiteralNode(metadataExtractorVersion)
));
metadataGraph.Assert(new Triple(
metadataGraph.CreateUriNode(recentMetadataVersion),
metadataGraph.CreateUriNode(new Uri("http://purl.org/fdp/fdp-o#isMetadataOf")),
metadataGraph.CreateUriNode(recentDataVersion)
));
var provenanceGraphs = new List<IGraph> { dataGraph, metadataGraph };
GraphStorer.StoreGraphs(provenanceGraphs, _rdfStoreConnector);
GraphStorer.StoreGraphs(
_metadataGraphsCreator.UpdateExtractionGraphs(
resourceId,
entry.Key,
recentDataVersion,
recentMetadataVersion,
metadataExtractorVersion
),
_rdfStoreConnector);
}
private async Task CreateHashData(string resourceId, ResourceEntry entry, BaseResourceType resourceTypeDefinition, Dictionary<string, string>? resourceTypeOptions, string newFileGraphNameAddon, Uri? recentDataVersion)
private async Task CreateHashData(string resourceId, ResourceEntry entry, BaseResourceType resourceTypeDefinition, Dictionary<string, string>? resourceTypeOptions, string newFileGraphNameAddon, string? recentDataVersion)
{
var dataGraphName = $"{newFileGraphNameAddon}@type=data";
var dataGraph = CreateOrGetGraph(dataGraphName);
var hashTriples = new List<Triple>();
var loadedEntry = await resourceTypeDefinition.LoadEntry(resourceId, entry.Key, resourceTypeOptions);
if (loadedEntry is null)
@@ -306,25 +230,16 @@ public class CoscineMetadataExtractor : IMetadataExtractor
throw new NullReferenceException("The resulting stream of the loaded entry is null, when trying to hash the data.");
}
var sha512Hash = Convert.ToBase64String(HashUtil.HashData(loadedEntry, HashAlgorithmName.SHA512));
var dataGraphId = recentDataVersion;
var hashGraphId = new Uri($"{dataGraphId?.AbsoluteUri}&hash={Guid.NewGuid()}");
var dataGraphSubject = dataGraph.CreateUriNode(dataGraphId);
var hashSubject = dataGraph.CreateUriNode(hashGraphId);
var defaultHash = Convert.ToBase64String(HashUtil.HashData(loadedEntry));
hashTriples.Add(new Triple(dataGraphSubject,
dataGraph.CreateUriNode(new Uri("http://www.ebu.ch/metadata/ontologies/ebucore/ebucore#hashType")),
hashSubject));
hashTriples.Add(new Triple(hashSubject,
dataGraph.CreateUriNode(new Uri("http://www.ebu.ch/metadata/ontologies/ebucore/ebucore#hashFunction")),
dataGraph.CreateLiteralNode("SHA512")));
hashTriples.Add(new Triple(hashSubject,
dataGraph.CreateUriNode(new Uri("http://www.ebu.ch/metadata/ontologies/ebucore/ebucore#hashValue")),
dataGraph.CreateLiteralNode(sha512Hash, new Uri("http://www.w3.org/2001/XMLSchema#hexBinary"))));
if (recentDataVersion is null)
{
return;
}
GraphStorer.AddToGraph(dataGraph, hashTriples, _rdfStoreConnector);
GraphStorer.AddToGraph(dataGraph, HashUtil.CreateHashTriples(
dataGraph, new Uri(recentDataVersion), defaultHash
), _rdfStoreConnector);
}
private static void FormatResultMetadata(TripleStore tripleStore, Uri dataExtractGraph, Uri metadataExtractGraph)
@@ -347,30 +262,6 @@ public class CoscineMetadataExtractor : IMetadataExtractor
}
}
/// <summary>
/// Links <paramref name="graphUri"/> to <paramref name="thePartUri"/> inside
/// <paramref name="trellisGraph"/> unless that link is already present.
/// </summary>
/// <remarks>
/// When the "part of" triple is missing, it is added together with a type triple
/// pointing at <paramref name="ldpAssignment"/>, and <c>AddModifiedDate</c> is then
/// invoked for <paramref name="graphUri"/>. Every triple created here is both
/// appended to <paramref name="triples"/> and asserted on the graph. If the
/// "part of" triple already exists, nothing is changed.
/// </remarks>
/// <param name="trellisGraph">Graph that is inspected and extended.</param>
/// <param name="ldpAssignment">URI used as the object of the type triple.</param>
/// <param name="thePartUri">URI used as the object of the "part of" triple.</param>
/// <param name="graphUri">URI used as the subject of all triples added here.</param>
/// <param name="triples">Collection that receives every newly created triple.</param>
private static void AddToTrellis(IGraph trellisGraph, string ldpAssignment, string thePartUri, string graphUri, ICollection<Triple> triples)
{
    var graphNode = trellisGraph.CreateUriNode(new Uri(graphUri));
    var partNode = trellisGraph.CreateUriNode(new Uri(thePartUri));
    var partOfPredicate = trellisGraph.CreateUriNode(new Uri(partOfUri));
    var membershipTriple = new Triple(graphNode, partOfPredicate, partNode);

    // Already registered: leave both the graph and the output collection untouched.
    if (trellisGraph.ContainsTriple(membershipTriple))
    {
        return;
    }

    triples.Add(membershipTriple);
    trellisGraph.Assert(membershipTriple);

    var typePredicate = trellisGraph.CreateUriNode(new Uri(aUri));
    var assignmentNode = trellisGraph.CreateUriNode(new Uri(ldpAssignment));
    var typeTriple = new Triple(graphNode, typePredicate, assignmentNode);
    triples.Add(typeTriple);
    trellisGraph.Assert(typeTriple);

    AddModifiedDate(trellisGraph, graphUri, triples);
}
private IGraph CreateOrGetGraph(string graphUrl)
{
var entryAlreadyExists = _rdfStoreConnector.HasGraph(graphUrl);
@@ -382,22 +273,4 @@ public class CoscineMetadataExtractor : IMetadataExtractor
};
}
/// <summary>
/// Adds a modified-date triple for <paramref name="root"/> to <paramref name="graph"/>
/// when the graph holds no triple with that subject and the modified predicate yet.
/// </summary>
/// <remarks>
/// The value is the current UTC time in round-trip ("o") format, typed with the
/// XML Schema dateTime datatype. The new triple is appended to
/// <paramref name="triples"/> and asserted on the graph. If a matching triple
/// already exists, nothing is changed.
/// </remarks>
/// <param name="graph">Graph that is inspected and extended.</param>
/// <param name="root">URI of the subject that receives the modified date.</param>
/// <param name="triples">Collection that receives the newly created triple.</param>
private static void AddModifiedDate(IGraph graph, string root, ICollection<Triple> triples)
{
    var modifiedPredicate = graph.CreateUriNode(new Uri(dctermsModifiedUri));
    var subjectNode = graph.CreateUriNode(new Uri(root));

    // An existing modified date is kept as it is.
    if (graph.GetTriplesWithSubjectPredicate(subjectNode, modifiedPredicate).Any())
    {
        return;
    }

    var timestamp = DateTime.UtcNow.ToString("o", CultureInfo.InvariantCulture);
    var timestampNode = graph.CreateLiteralNode(
        timestamp,
        new Uri(XmlSpecsHelper.XmlSchemaDataTypeDateTime));

    var modifiedTriple = new Triple(subjectNode, modifiedPredicate, timestampNode);
    triples.Add(modifiedTriple);
    graph.Assert(modifiedTriple);
}
}
\ No newline at end of file
Loading