diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..594b9a16db8c7076862ba56654433a9adde247f3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +*venv* +docs/macros/__pycache__/* +public/* \ No newline at end of file diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..2b371901915503f2c783537c238698e30f19f861 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,25 @@ +image: python:3.12-alpine + +before_script: + - pip install -r requirements.txt + - pip install -e scripts + +test: + stage: test + script: + - mkdocs build --strict --verbose --site-dir test + artifacts: + paths: + - test + # rules: + # - if: $CI_COMMIT_REF_NAME != $CI_DEFAULT_BRANCH + +pages: + stage: deploy + script: + - mkdocs build --strict --verbose --site-dir public + artifacts: + paths: + - public + # rules: + # - if: $CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH diff --git a/README.md b/README.md index afdcdbd1aea7bcb863bdfe04111e5d2591d7340d..094b43d73fdfc4a01055132ea7b4b76635e82b3b 100644 --- a/README.md +++ b/README.md @@ -1,162 +1,70 @@ -# KnowledgeHub - Domain Coverage +# NFDI4Earth - KnowledgeHub - KnowledgeGraph Analysis -This is collection of relevant questions and corresponding SPARQL-Queries, that answer those questions. The questions are grouped according to the different entities of interest (datasets, organizations, ...). The entities appear in alphabetical order. The first query is useful to get an overview of all entities available in the Knowledge Hub. The questions listed below form, altogether, the domain coverage of the Knowledge Hub. For details, see the [NFDI4Earth Deliverable D4.3.2](https://zenodo.org/records/7950860). +A collection of SPARQL queries for analyzing the KnowledgeGraph of NFDI4Earth's KnowledgeHub. -***Overview of the types of entities*** -| ID | Question | Query/ies | -|---|---|---| -| TY001 | What are the types of entities available in the knowledge graph? | [TY001](queries/TY001.rq)| +## Repository Structure - - +``` +├── docs/ +│ ├── index.md # Main documentation page +│ ├── examples.md +│ ├── questions.md +│ ├── macros/ +│ ├── metrics/ +│ │ ├── general_metrics/ +│ │ ├── resource_metrics/ +│ │ └── schema_complexity_metrics/ +│ └── queries/ # Query documentation +│ ├── examples/ +│ ├── metrics/ +│ └── questions/ +│ +└── scripts/ # Python analysis tools + └── kg_analysis/ + └── ... +``` -### Aggregator +## Usage -| ID | Question | Query/ies | -|---|---|---| -| AG001 | What are all entities of type Aggregator? | [AG001](queries/AG001.rq)| -| AG001_2 | What are name and geometry of Aggregator? | [AG001_2](queries/AG001_2.rq)| -| AG002_1 | What are all attributes available for the type "Aggregator"? | [AG002_1](queries/AG002_1.rq)| -| AG002_2 | How many attributes are available for the type "Aggregator"? | [AG002_2](queries/AG002_2.rq)| +The repository consists of three main components: -### Article +1. **Documentation** (`/docs`): Comprehensive documentation of queries, examples and metrics +2. **Query Collection** (`/docs/queries`): SPARQL queries organized by purpose +3. **Analysis Tool** (`/scripts`): Python package for executing queries and calculating metrics -| ID | Question | Query/ies | -|---|---|---| -| AT001 | What are all entities of type schema:Article? | [AT001](queries/AT001.rq)| -| AT002_1 | What are all attributes available for the type "schema:Article"? | [AT002_1](queries/AT002_1.rq)| -| AT002_2 | How many attributes are available for the type "schema:Article"? 
| [AT002_2](queries/AT002_2.rq)| +### Local Development -### Dataset +1. Setup virtual environment: +```bash +python3 -m venv venv +source venv/bin/activate +``` -| ID | Question | Query/ies | -|---|---|---| -| DA001 | What are all entities of type dcat:Dataset? | [DA001](queries/DA001.rq)| -| DA002_1 | What are all attributes available for the type "dcat:Dataset"? | [DA002_1](queries/DA002_1.rq)| -| DA002_2 | How many attributes are available for the type "dcat:Dataset"? | [DA002_2](queries/DA002_2.rq)| -| DA003_1 | What are the datasets having the string 'world settlement footprint' in title or description? | [DA003_1](queries/DA003_1.rq)| +2. Install dependencies: +```bash +pip install -r requirements.txt +pip install -e scripts/ +``` -### LHBArticle +3. Start documentation server: +```bash +mkdocs serve +``` -| ID | Question | Query/ies | -|---|---|---| -| LH001 | What are all entities of type LHBArticle? | [LH001](queries/LH001.rq)| -| LH002_1 | What are all attributes available for the type "LHBArticle"? | [LH002_1](queries/LH002_1.rq)| -| LH002_2 | How many attributes are available for the type "LHBArticle"? | [LH002_2](queries/LH002_2.rq)| +## Contributing -### LearningResource +We welcome contributions: -| ID | Question | Query/ies | -|---|---|---| -| LR001 | What are all entities of type LearningResource? | [LR001](queries/LR001.rq)| -| LR002_1 | What are all attributes available for the type "LearningResource"? | [LR002_1](queries/LR002_1.rq)| -| LR002_2 | How many attributes are available for the type "LearningResource"? | [LR002_2](queries/LR002_2.rq)| +- New SPARQL queries +- Documentation improvements +- Tool enhancements -### MetadataStandard +### Contributors -| ID | Question | Query/ies | -|---|---|---| -| MS001 | What are all entities of type MetadataStandard? | [MS001](queries/MS001.rq)| -| MS002_1 | What are all attributes available for the type "MetadataStandard"? | [MS002_1](queries/MS002_1.rq)| -| MS002_2 | How many attributes are available for the type "MetadataStandard"? | [MS002_2](queries/MS002_2.rq)| +Ralf Klammer, Auriol Degbelo, Jonas Grieb -### Organization +## Contact -| ID | Question | Query/ies | -|---|---|---| -| OG001 | What are all entities of type Organization? | [OG001](queries/OG001.rq)| -| OG002_1 | What are all attributes available for the type "Organization"? | [OG002_1](queries/OG002_1.rq)| -| OG002_2 | How many attributes are available for the type "Organization"? | [OG002_2](queries/OG002_2.rq)| - -### Person - -| ID | Question | Query/ies | -|---|---|---| -| PE001 | What are all entities of type Person? | [PE001](queries/PE001.rq)| -| PE002_1 | What are all attributes available for the type "Person"? | [PE002_1](queries/PE002_1.rq)| -| PE002_2 | How many attributes are available for the type "Person"? | [PE002_2](queries/PE002_2.rq)| - -### Registry - -| ID | Question | Query/ies | -|---|---|---| -| REG001 | What are all entities of type Registry? | [REG001](queries/REG001.rq)| -| REG002_1 | What are all attributes available for the type "Registry"? | [REG002_1](queries/REG002_1.rq)| -| REG002_2 | How many attributes are available for the type "Registry"? | [REG002_2](queries/REG002_2.rq)| - -### Repository - -| ID | Question | Query/ies | -|---|---|---| -| REP001 | What are all entities of type Repository? | [REP001](queries/REP001.rq)| -| REP002_1 | What are all attributes available for the type "Repository"? | [REP002_1](queries/REP002_1.rq)| -| REP002_2 | How many attributes are available for the type "Repository"? 
| [REP002_2](queries/REP002_2.rq)| - -### ResearchProject - -| ID | Question | Query/ies | -|---|---|---| -| RP001 | What are all entities of type ResearchProject? | [RP001](queries/RP001.rq)| -| RP002_1 | What are all attributes available for the type "ResearchProject"? | [RP002_1](queries/RP002_1.rq)| -| RP002_2 | How many attributes are available for the type "ResearchProject"? | [RP002_2](queries/RP002_2.rq)| - - -### SoftwareSourceCode - -| ID | Question | Query/ies | -|---|---|---| -| SC001 | What are all entities of type SoftwareSourceCode? | [SC001](queries/SC001.rq)| -| SC002_1 | What are all attributes available for the type "SoftwareSourceCode"? | [SC002_1](queries/SC002_1.rq)| -| SC002_2 | How many attributes are available for the type "SoftwareSourceCode"? | [SC002_2](queries/SC002_2.rq)| - - -<!--- Template for a new table (including first line) - -### EntityType - -| ID | Question | Query/ies | -|---|---|---| -| XX001 | What are all entities of type EntityType? | [XX001](queries/XX001.rq)| - ---> - - -<!--- - - -### Organizations - -| ID | Question | Query/ies | -|---|---|---| -| OR001 | What is the URL of the homepage for the organization with the following name: 'Karlsruhe Institute of Technology'? | [OR001_1](queries/OR001_1.rq),[OR001_2](queries/OR001_2.rq) | -| OR002 | What is the URL of the homepage for the organization with the following ID: 'https://nfdi4earth-knowledgehub.geo.tu-dresden.de/api/objects/n4ekh/a38143be5e15bed94a20' | [OR003_1](queries/OR003_1.rq) | -| OR003 | Which organizations have not defined any homepage? | [OR003_1](queries/OR003_1.rq) | -| OR004 | Which services are published by the organization? | [OR004_1](queries/OR004_1.rq) | -| OR005 | What is the geolocation of the organization called 'TU Dresden'? | [OR005_1](queries/OR005_1.rq) | -| OR006 | What is the geolocation of all organizations, that are members of the NFDI4Earth consortium? | [OR006_1](queries/OR006_1.rq) | - -### Repositories - -| ID | Question | Query/ies | -|----|----------|-----------| -| DR1 | At which repository can I archive my [geophysical] data of [2] GB?| [OR004_1](queries/OR004_1.rq) | -| DR2 | What is the temporal coverage of a data repository?|| -| DR3 | What is the spatial coverage of a data repository?|| -| DR4 | What is the curation policy of the data repository?|| -| DR5 | Which licences are supported by the data repository?|| -| DR6 | Does the repository give identifiers for its ressources?|| -| DR7 | Which metadata harversting interface is supported by the repository?|| -| DR8 | Which type of (persistent) identifiers are used by the repository?|| -| DR9 | What is the thematic area/subject of a repository?|| -| DR10 | Limitations of data deposit at the repository?|| -| DR11 | When was the medatada for a given repository first collected/last updated?|| -| DR12 | Is the repository still available?|| -| DR13 | Which repository allows long term archiving?|| - ---> - -# Notes - -This question-based approach takes inspiration from the [GeoSPARQLBenchmark](https://github.com/OpenLinkSoftware/GeoSPARQLBenchmark). - -It is directly linked to the [Knowledge Hub landing page project](https://git.rwth-aachen.de/nfdi4earth/knowledgehub/kh_landingpage) as all the questions and examples are taken to explain the basic idea and demonstrate usage of the [Knowledge Hub](https://knowledgehub.nfdi4earth.de). 
+
+For questions about the NFDI4Earth KnowledgeHub Graph:
+- Email: [helpdesk@nfdi4earth.de](mailto:helpdesk@nfdi4earth.de?subject=[NFDI4Earth][KnowledgeGraph])
+- Website: [https://knowledgehub.nfdi4earth.de](https://knowledgehub.nfdi4earth.de)
\ No newline at end of file
diff --git a/docs/assets/NFDI4Earth_Symbol.png b/docs/assets/NFDI4Earth_Symbol.png
new file mode 100644
index 0000000000000000000000000000000000000000..1a357281f6c49602f4c52ce75850e81db513ae8f
Binary files /dev/null and b/docs/assets/NFDI4Earth_Symbol.png differ
diff --git a/docs/assets/favicon.ico b/docs/assets/favicon.ico
new file mode 100644
index 0000000000000000000000000000000000000000..83fd5498b6832c052576e708cbd4e5e5eda77d6d
Binary files /dev/null and b/docs/assets/favicon.ico differ
diff --git a/docs/examples.md b/docs/examples.md
new file mode 100644
index 0000000000000000000000000000000000000000..7f952e7f7af8028a4090a2d40ff94b59044a5c27
--- /dev/null
+++ b/docs/examples.md
@@ -0,0 +1,124 @@
+# Examples
+
+This collection contains representative example queries that serve as entry points for using the NFDI4Earth KnowledgeGraph.
+
+## About this Collection
+
+The SPARQL queries presented here demonstrate fundamental query patterns and typical use cases. They are intentionally kept simple and serve as:
+
+- Examples for commonly needed query types
+- Templates for your own, more complex queries
+- Learning material for SPARQL beginners
+
+## Educational Resources Example
+
+This example demonstrates how to retrieve comprehensive information about educational resources from the NFDI4Earth Knowledge Hub.
+
+### Query Purpose
+
+The query finds all learning resources and their associated metadata, including:
+
+- Publishers and their names
+- Subject areas with labels
+- Licensing information
+- Related topics and their titles
+
+### SPARQL Features Demonstrated
+
+- Use of `CONSTRUCT` for graph pattern matching
+- Multiple `OPTIONAL` patterns for flexible data retrieval
+- Property path navigation
+- Handling of multiple vocabularies (schema.org, FOAF, DCT)
+
+### Query
+
+```sparql
+{{ include_if_exists("examples/Educational resources.rq") }}
+```
+
+### Understanding the Results
+
+The query returns a graph structure where:
+
+- Each learning resource is connected to its direct properties
+- Additional metadata about publishers, subjects, and licenses is included
+- Labels and titles are retrieved for better human readability
+
+
+## Repository Metadata Standards Example
+
+This example shows how to query metadata standards and API types supported by repositories in the NFDI4Earth Knowledge Hub.
+
+### Query Purpose
+
+The query retrieves information about:
+- Repository names
+- Supported metadata standards
+- Available API types and interfaces
+
+### SPARQL Features Demonstrated
+
+- Basic `SELECT` query structure
+- Multiple triple patterns
+- Property traversal
+- Use of domain-specific vocabularies (n4e)
+
+### Query
+
+```sparql
+{{ include_if_exists("examples/Metadata.rq") }}
+```
+
+### Understanding the Results
+
+The query returns:
+
+- Repository names for clear identification
+- Names of supported metadata standards
+- Types of APIs available for each repository
+
+This information is particularly useful for:
+
+- Understanding repository capabilities
+- Planning data integration
+- Evaluating technical compatibility
+
+## NFDI4Earth Services Example
+
+This example demonstrates how to query services provided by organizations within the NFDI4Earth consortium.
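+
+A minimal, hypothetical sketch of the pattern first (the `n4e` class IRIs match those used elsewhere in this documentation, while `dct:publisher` and `foaf:name` are assumed predicates; the full, authoritative query is included below):
+
+```sparql
+# Illustrative sketch only -- see the full query further down.
+PREFIX n4e:  <http://nfdi4earth.de/ontology/>
+PREFIX dct:  <http://purl.org/dc/terms/>
+PREFIX foaf: <http://xmlns.com/foaf/0.1/>
+
+SELECT ?service (GROUP_CONCAT(DISTINCT ?orgName; separator=", ") AS ?publishers)
+WHERE {
+  { ?service a n4e:Repository . } UNION { ?service a n4e:Aggregator . }
+  ?service dct:publisher ?org .
+  ?org foaf:name ?orgName .
+}
+GROUP BY ?service
+```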
+ +### Query Purpose + +The query identifies: +- Services (Repositories and Aggregators) +- Their names and types +- Publishing organizations within NFDI4Earth +- Including services from sub-organizations + +### SPARQL Features Demonstrated + +- Complex `SELECT` query with `GROUP_CONCAT` +- `UNION` patterns for alternative paths +- Organization hierarchy traversal +- Filter conditions with `NOT EXISTS` +- Value constraints using `VALUES` + +### Query + +```sparql +{{ include_if_exists("examples/Services in the NFDI4Earth.rq") }} +``` + +### Understanding the Results + +The query returns: + +- Service names and their types (Repository or Aggregator) +- Concatenated list of publishing organizations +- Services from both direct NFDI4Earth members and their sub-organizations + +This query is useful for: + +- Getting an overview of NFDI4Earth service landscape +- Understanding organizational relationships +- Service discovery and analysis \ No newline at end of file diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000000000000000000000000000000000000..bedb30fc1ba37f0eaac4bd6f52410e1b2316b984 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,61 @@ +# Overview + +Welcome to the NFDI4Earth KnowledgeGraph Query Collection. This documentation provides insights into the NFDI4Earth KnowledgeHub Graph through SPARQL queries and their analysis. + +## About the Project + +The NFDI4Earth KnowledgeGraph represents a comprehensive network of earth science research data, connecting various domains, datasets, and research artifacts. Based on the NFDI4Earth metadata schema, it enables: + +- **Linked Research Data**: Integration of various data sources and research artifacts +- **Semantic Search**: Intelligent discovery of relevant resources +- **Community Integration**: Connecting researchers and their work + +## Purpose + +Through SPARQL queries, we explore: + +- **Data Discovery**: Finding relevant research data across earth science domains +- **Domain Coverage**: Understanding the breadth and depth of represented research areas +- **Graph Structure**: Analyzing the knowledge graph's characteristics and connectivity + +## Query Collection + +Our queries are organized into three main categories. + +[**1. Basic Examples**](./examples) + + - Common query patterns + - Graph exploration basics + - Getting started guides + +[**2. Domain Questions**](./questions) + + - Domain-specific research questions + - Real-world use cases + - Complex query patterns + +[**3. Graph Metrics**](./metrics) + + - Structural metrics + - Quality analysis + - Network characteristics + +## Getting Started + +Each query is documented with: +- Detailed description of the use case +- SPARQL code with explanations +- Example results and visualizations +- Interpretation guidelines +- Performance considerations + +## Contributing + +We invite the community to contribute their own queries! 
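+
+To add a new query, place the `.rq` file under `docs/queries/` and embed it in a documentation page with the `include_if_exists` macro from `docs/macros/main.py` (the file name below is a placeholder):
+
+{% raw %}
+```
+{{ include_if_exists("questions/XY001.rq") }}
+```
+{% endraw %}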
+
+## Resources
+
+- [NFDI4Earth OneStop4All](https://onestop4all.nfdi4earth.de)
+- [KnowledgeHub Documentation](https://knowledgehub.nfdi4earth.de)
+- [SPARQL Endpoint](https://sparql.knowledgehub.nfdi4earth.de)
+- [GitLab Repository](https://git.rwth-aachen.de/nfdi4earth/knowledgehub/kh_questions)
diff --git a/docs/macros/main.py b/docs/macros/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..86b58860be2d4c8359b3742bbbeeda88f3209ce9
--- /dev/null
+++ b/docs/macros/main.py
@@ -0,0 +1,133 @@
+import logging
+
+from kg_analysis.table_renderer import MetricsTableRenderer  # type: ignore
+
+log = logging.getLogger(__name__)
+
+
+def define_env(env):
+    @env.macro
+    def include_if_exists(
+        filename,
+        path="docs/queries",
+        start_line=None,
+        single_line=None,
+    ):
+        """
+        Includes the content of a file if it exists.
+
+        Args:
+            filename (str): The path to the file.
+            path (str, optional): The base directory that filename is
+                resolved against. Defaults to "docs/queries".
+            start_line (int, optional): The line number to start from.
+            single_line (int, optional): The specific line number to include.
+
+        Returns:
+            str: The content of the file or an error message if the file
+                does not exist.
+        """
+        try:
+            filename = f"{path}/{filename}"
+            with open(filename, "r") as f:
+                lines = f.readlines()
+                if start_line is not None:
+                    return "".join(lines[start_line:])
+                elif single_line is not None:
+                    return lines[single_line]
+                return "".join(lines)
+        except FileNotFoundError:
+            return f"*No results available. The file `{filename}` wasn't found.*"
+        except IndexError:
+            return f"*Error: The specified line {single_line} does not exist in `{filename}`.*"
+
+    @env.macro
+    def include_template(template_path, resource_type, **kwargs):
+        """
+        Includes a template file and replaces {resource_type} with the given
+        value.
+
+        Args:
+            template_path (str): Path to the template file.
+            resource_type (str): The resource type URI to inject.
+
+        Returns:
+            str: The content of the template with the resource type injected.
+        """
+        content = include_if_exists(template_path, **kwargs)
+        if content:
+            return content.replace("{resource_type}", resource_type)
+        return ""
+
+    @env.macro
+    def metrics_table_single_resource(resource_type=None):
+        """
+        Renders a metrics table for a single resource type.
+
+        Args:
+            resource_type (str, optional): The resource type URI.
+
+        Returns:
+            str: The rendered metrics table.
+        """
+        return MetricsTableRenderer(
+            table_type="resource", resource_type=resource_type
+        ).render()
+
+    @env.macro
+    def metrics_table_overview_resource():
+        """
+        Renders an overview metrics table for all resource types.
+
+        Returns:
+            str: The rendered overview metrics table.
+        """
+        return MetricsTableRenderer(table_type="resource_overview").render()
+
+    @env.macro
+    def metrics_table_overview_general():
+        """
+        Renders an overview metrics table for general metrics.
+
+        Returns:
+            str: The rendered overview metrics table.
+        """
+        return MetricsTableRenderer(table_type="general").render()
+
+    @env.macro
+    def metrics_table_single_general(metric_key):
+        """
+        Renders a metrics table for a single general metric.
+
+        Args:
+            metric_key (str): The key of the general metric.
+
+        Returns:
+            str: The rendered metrics table.
+        """
+        return MetricsTableRenderer(
+            table_type="general", metric_key=metric_key
+        ).render()
+
+    @env.macro
+    def metrics_table_overview_complexity():
+        """
+        Renders an overview metrics table for complexity metrics.
+
+        Returns:
+            str: The rendered overview metrics table.
+ """ + return MetricsTableRenderer(table_type="complexity").render() + + @env.macro + def metrics_table_single_complexity(metric_key): + """ + Renders a metrics table for a single complexity metric. + + Args: + metric_key (str): The key of the complexity metric. + + Returns: + str: The rendered metrics table. + """ + return MetricsTableRenderer( + table_type="complexity", metric_key=metric_key + ).render() diff --git a/docs/macros/resource_metrics.md b/docs/macros/resource_metrics.md new file mode 100644 index 0000000000000000000000000000000000000000..ba82029cdbd2e2c535e2929759e37f7bb436eef4 --- /dev/null +++ b/docs/macros/resource_metrics.md @@ -0,0 +1,119 @@ +{% macro resource_metrics(resource_type, resource_type_uri) %} + +Resource type: {{resource_type_uri}} + +## Results +{{ metrics_table_single_resource(resource_type) }} + +## Basic Metrics + +### Number of Entitis + +see also: [general_metricsinstances](/metrics/general%20metrics/01_instances/) + +<i id="RM001_instances_template.rq">file: RM001_instances_template.rq</i> + +```sparql +{{ include_template("metrics/RM001_instances_template.rq", resource_type_uri) }} +``` +### Connectivity to other resources + +<i id="RM006_connectivity_template.rq">file: RM006_connectivity_template.rq</i> + +```sparql +{{ include_template("metrics/RM006_connectivity_template.rq", resource_type_uri) }} +``` + +### Number of Assertions + +see also: [general_metricsassortions](/metrics/general%20metrics/02_assertions/) + +<i id="RM002_assertions_template.rq">file: RM002_assertions_template.rq</i> + +```sparql +{{ include_template("metrics/RM002_assertions_template.rq", resource_type_uri) }} +``` + +### Average linkage + +see also: [general_metricslinkage](/metrics/general%20metrics/03_linkage_degree/) + +<i id="RM003_linkage_template.rq">file: RM003_linkage_template.rq</i> + +```sparql +{{ include_template("metrics/RM003_linkage_template.rq", resource_type_uri) }} +``` + +### Outgoing Edges Statistics + +see also: [general_metricsoutgoing edges](/metrics/general%20metrics/04_outgoing_edges/) + +#### Total outgoing edges + +<i id="RM004_1_out_edges_total_template.rq">file: RM004_1_out_edges_total_template.rq</i> + +```sparql +{{ include_template("metrics/RM004_1_out_edges_total_template.rq", resource_type_uri) }} +``` + +#### Minimum outgoing edges + +<i id="RM004_2_out_edges_min_template.rq">file: RM004_2_out_edges_min_template.rq</i> + +```sparql +{{ include_template("metrics/RM004_2_out_edges_min_template.rq", resource_type_uri) }} +``` + +#### Median outgoing edges + +<i id="RM004_3_out_edges_median_template.rq">file: RM004_3_out_edges_median_template.rq</i> + +```sparql +{{ include_template("metrics/RM004_3_out_edges_median_template.rq", resource_type_uri) }} +``` + +#### Maximum outgoing edges + +<i id="RM004_4_out_edges_max_template.rq">file: RM004_4_out_edges_max_template.rq</i> + +```sparql +{{ include_template("metrics/RM004_4_out_edges_max_template.rq", resource_type_uri) }} +``` + +### Incoming Edges Statistics + +see also: [general_metricsincoming edges](/metrics/general%20metrics/05_incoming_edges/) + +#### Total incoming edges + +<i id="RM005_1_in_edges_total_template.rq">file: RM005_1_in_edges_total_template.rq</i> + +```sparql +{{ include_template("metrics/RM005_1_in_edges_total_template.rq", resource_type_uri) }} +``` + +#### Minimum incoming edges + +<i id="RM005_2_in_edges_min_template.rq">file: RM005_2_in_edges_min_template.rq</i> + +```sparql +{{ include_template("metrics/RM005_2_in_edges_min_template.rq", resource_type_uri) }} +``` 
+
+#### Median incoming edges
+
+<i id="RM005_3_in_edges_median_template.rq">file: RM005_3_in_edges_median_template.rq</i>
+
+```sparql
+{{ include_template("metrics/RM005_3_in_edges_median_template.rq", resource_type_uri) }}
+```
+
+#### Maximum incoming edges
+
+<i id="RM005_4_in_edges_max_template.rq">file: RM005_4_in_edges_max_template.rq</i>
+
+```sparql
+{{ include_template("metrics/RM005_4_in_edges_max_template.rq", resource_type_uri) }}
+```
+
+{% endmacro %}
diff --git a/docs/metrics/general_metrics/01_instances.md b/docs/metrics/general_metrics/01_instances.md
new file mode 100644
index 0000000000000000000000000000000000000000..57ae0b3a1e5b962e16c6467337f6c837ea184453
--- /dev/null
+++ b/docs/metrics/general_metrics/01_instances.md
@@ -0,0 +1,22 @@
+# Instances in a graph
+
+This metric counts the total number of instances (nodes) in an RDF graph. An instance is counted as any node that carries an explicit `rdf:type` statement (see the query below). The instance count provides a fundamental measure of the graph's size and gives a first indication of its complexity.
+
+The metric helps to:
+
+- Understand the overall scale of the knowledge graph
+- Track growth over time
+- Compare different graph versions or datasets
+- Establish a baseline for other metrics
+
+{{ metrics_table_single_general('instances') }}
+
+## Queries
+
+### Count Number of instances in a graph
+
+<i id="GM001.rq">file: GM001.rq</i>
+
+```sparql
+{{ include_if_exists("metrics/GM001.rq") }}
+```
diff --git a/docs/metrics/general_metrics/02_assertions.md b/docs/metrics/general_metrics/02_assertions.md
new file mode 100644
index 0000000000000000000000000000000000000000..a3d13637dff05e9c8ac9d95cb225ba66b7c8594a
--- /dev/null
+++ b/docs/metrics/general_metrics/02_assertions.md
@@ -0,0 +1,38 @@
+# Assertions in a graph
+
+This metric counts the total number of assertions (triples/edges) in an RDF graph. An assertion is any statement in the form of subject-predicate-object that exists in the graph. The assertion count provides a fundamental measure of the graph's connectivity and density.
+
+The metric helps to:
+
+- Measure the total number of relationships in the graph
+- Understand the graph's density
+- Track the growth of relationships over time
+- Compare connectivity between different graph versions
+
+{{ metrics_table_single_general('assertions') }}
+
+## Queries
+
+### Count Total Number of Assertions
+
+<i id="GM002_1.rq">file: GM002_1.rq</i>
+
+```sparql
+{{ include_if_exists("metrics/GM002_1.rq") }}
+```
+
+### Count Number of Entity-to-Entity Assertions
+
+<i id="GM002_2.rq">file: GM002_2.rq</i>
+
+```sparql
+{{ include_if_exists("metrics/GM002_2.rq") }}
+```
+
+### Count Number of Entity-to-Literal Assertions
+
+<i id="GM002_3.rq">file: GM002_3.rq</i>
+
+```sparql
+{{ include_if_exists("metrics/GM002_3.rq") }}
+```
diff --git a/docs/metrics/general_metrics/03_linkage_degree.md b/docs/metrics/general_metrics/03_linkage_degree.md
new file mode 100644
index 0000000000000000000000000000000000000000..defdff0f968396e49a5af5f2bc049f1d161461b8
--- /dev/null
+++ b/docs/metrics/general_metrics/03_linkage_degree.md
@@ -0,0 +1,59 @@
+# Linkage Degree Analysis
+
+This metric analyzes the connectivity patterns in the graph by measuring the linkage degree of entities. The linkage degree represents how well entities are connected to other entities through relationships. It helps understand the graph's structural characteristics and identifies patterns of connectivity.
+ +The metric provides insights into: + +- Average number of relationships per entity +- Distribution of connections across the graph +- Identification of highly connected or isolated entities +- Overall graph connectivity patterns + +{{ metrics_table_single_general('linkage') }} + +## Queries + +### Average Linkage Degree + +This query calculates the average number of relationships (both incoming and outgoing) per entity in the graph. + +<i id="GM003_1.rq">file: GM003_1.rq</i> + +```sparql +{{ include_if_exists("metrics/GM003_1.rq") }} +``` + +??? warning "Batch Processing Limitation" + The following queries use batch processing with a limitation of 1000 entities. While this enables quick results, it leads to approximate values. The results only approach the actual average value when using batch sizes in the six-figure range. + + For precise analysis, either adjust the LIMIT value accordingly or use the non-batch version. + +### Average Linkage Degree (Batch Processing) + +This is an optimized version of the linkage degree calculation that uses batch processing. It's particularly useful for large datasets where the standard query might timeout or consume too many resources. The query processes a limited number of entities at a time. + +<i id="GM003_2.rq">file: GM003_2.rq</i> + +```sparql +{{ include_if_exists("metrics/GM003_2.rq") }} +``` + +### Average Outgoing Linkage Degree (Batch Processing) + +This query focuses specifically on outgoing relationships, calculating the average number of outgoing edges per entity. It uses batch processing for efficient execution on larger datasets. + +<i id="GM003_3.rq">file: GM003_3.rq</i> + +```sparql +{{ include_if_exists("metrics/GM003_3.rq") }} +``` + +### Average Incoming Linkage Degree (Batch Processing) + +This query calculates the average number of incoming relationships per entity, providing insights into how frequently entities are referenced by others in the graph. It also uses batch processing for efficiency. + +<i id="GM003_4.rq">file: GM003_4.rq</i> + +```sparql +{{ include_if_exists("metrics/GM003_4.rq") }} +``` diff --git a/docs/metrics/general_metrics/04_edges_incoming.md b/docs/metrics/general_metrics/04_edges_incoming.md new file mode 100644 index 0000000000000000000000000000000000000000..c4ad4230a5589495ea13bad7ab93ca4bda5cb01f --- /dev/null +++ b/docs/metrics/general_metrics/04_edges_incoming.md @@ -0,0 +1,88 @@ +# Incoming Edges + +This metric determines the median number of incoming edges across all nodes in the graph. The calculation requires multiple steps. + +## Understanding Incoming Edges + +In a knowledge graph, incoming edges represent relationships where other nodes point to or reference a particular node. Think of them as "arrows" pointing towards a node. 
For example: + +- If Dataset A `references` Resource B, then Resource B has an incoming edge +- If Organization X `publishes` Dataset Y, then Dataset Y has an incoming edge +- If Service M `supports` Standard N, then Standard N has an incoming edge + +### Significance + +The number of incoming edges can indicate: +- How frequently a resource is referenced or used +- The centrality or importance of a node in the network +- Potential bottlenecks or key connection points +- The interconnectedness of different resources + +### Example + +``` +Resource A ---hasLicense---> License X (X has 1 incoming edge) +Resource B ---hasLicense---> License X (X now has 2 incoming edges) +Resource C ---hasLicense---> License X (X now has 3 incoming edges) +``` + +```turtle +@prefix ex: <http://example.org/> . +@prefix dct: <http://purl.org/dc/terms/> . + +ex:ResourceA dct:license ex:LicenseX . +ex:ResourceB dct:license ex:LicenseX . +ex:ResourceC dct:license ex:LicenseX . +``` + +In this example, License X has 3 incoming edges, indicating it's a commonly used license in the graph. + +{{ metrics_table_single_general('edges_in') }} + +## Queries + +### Step 1: Count Incoming Edges Per Node + +```sparql +{{ include_if_exists("metrics/GM005_1.rq") }} +``` + +### Step 2: Get Total Number of Nodes + +<i id="GM005_2.rq">file: GM005_2.rq</i> + +```sparql +{{ include_if_exists("metrics/GM005_2.rq") }} +``` + +### Step 3: Calculate Median Position + +Using the total node count (n), median position is: position = (n+1)/2 + +```sparql +{{ include_if_exists("metrics/GM005_3.rq") }} +``` + +### Step 4: Get Median Value + +<i id="GM005_4.rq">file: GM005_4.rq</i> + +```sparql +{{ include_if_exists("metrics/GM005_4.rq") }} +``` + +### Step 5: Get Minimum Value + +<i id="GM005_5.rq">file: GM005_5.rq</i> + +```sparql +{{ include_if_exists("metrics/GM005_5.rq") }} +``` + +### Step 6: Get Maximum Value + +<i id="GM005_6.rq">file: GM005_6.rq</i> + +```sparql +{{ include_if_exists("metrics/GM005_6.rq") }} +``` diff --git a/docs/metrics/general_metrics/05_edges_outgoing.md b/docs/metrics/general_metrics/05_edges_outgoing.md new file mode 100644 index 0000000000000000000000000000000000000000..4b96894f115557644cc4967cb1589b1a50169d5c --- /dev/null +++ b/docs/metrics/general_metrics/05_edges_outgoing.md @@ -0,0 +1,90 @@ +# Outgoing Edges + +This metric determines the median number of outgoing edges across all nodes in the graph. The calculation requires multiple steps. + +## Understanding Outgoing Edges + +In a knowledge graph, outgoing edges represent relationships where a node points to or references other nodes. Think of them as "arrows" pointing away from a node. For example: + +- If Dataset A `references` Resource B, then Dataset A has an outgoing edge +- If Organization X `publishes` Dataset Y, then Organization X has an outgoing edge +- If Service M `supports` Standard N, then Service M has an outgoing edge + +### Significance + +The number of outgoing edges can indicate: +- How many relationships a node initiates +- The completeness of resource descriptions +- Connection patterns and data modeling practices +- The level of detail in resource metadata + +### Example + +``` +Dataset A ---dct:license-----> License X (A has now 1 outgoing edge) +Dataset A ---dct:publisher---> Organization Y (A has now 2 outgoing edges) +Dataset A ---schema:about----> Topic Z (A has now 3 outgoing edges) +``` + +```turtle +@prefix ex: <http://example.org/> . +@prefix dct: <http://purl.org/dc/terms/> . +@prefix schema: <http://schema.org/> . 
+ +# Dataset has 3 outgoing edges +ex:DatasetA dct:license ex:LicenseX ; + dct:publisher ex:OrganizationY ; + schema:about ex:TopicZ . +``` + +In this example, Dataset A has 3 outgoing edges, demonstrating a well-described resource with license, publisher, and topic information. + +{{ metrics_table_single_general('edges_out') }} + +## Queries + +### Step 1: Count Outgoing Edges Per Node + +```sparql +{{ include_if_exists("metrics/GM004_1.rq") }} +``` + +### Step 2: Get Total Number of Nodes + +<i id="GM004_2.rq">file: GM004_2.rq</i> + +```sparql +{{ include_if_exists("metrics/GM004_2.rq") }} +``` + +### Step 3: Calculate Median Position + +Using the total node count (n), median position is: position = (n+1)/2 + +```sparql +{{ include_if_exists("metrics/GM004_3.rq") }} +``` + +### Step 4: Get Median Value(s) + +<i id="GM004_4.rq">file: GM004_4.rq</i> + +```sparql +{{ include_if_exists("metrics/GM004_4.rq") }} +``` + +### Step 5: Get Minimum Value + +<i id="GM004_5.rq">file: GM004_5.rq</i> + +```sparql +{{ include_if_exists("metrics/GM004_5.rq") }} +``` + +### Step 6: Get Maximum Value + +<i id="GM004_6.rq">file: GM004_6.rq</i> + +```sparql +{{ include_if_exists("metrics/GM004_6.rq") }} +``` diff --git a/docs/metrics/index.md b/docs/metrics/index.md new file mode 100644 index 0000000000000000000000000000000000000000..d58e1a54bbaccf3de2969ed6207d82ab3bb53a7b --- /dev/null +++ b/docs/metrics/index.md @@ -0,0 +1,115 @@ +# Overview + +The NFDI4Earth Knowledge Graph metrics provide quantitative insights into our semantic data structure. We distinguish between two main metric categories: + +## General Metrics + +These metrics analyze the entire knowledge graph structure and provide insights into: + +- [Overall size and complexity](general_metrics/01_instances) +- [Graph density and distribution](general_metrics/02_assertions) +- [Linkage](general_metrics/03_linkage_degree) +- Edge statistics ([incoming](general_metrics/04_edges_incoming)/[outgoing](general_metrics/05_edges_outgoing)) + +### Results + +{{ metrics_table_overview_general() }} + +## Resource specific Metrics + +These metrics focus on specific resource types within the knowledge graph, analyzing key entities in the earth science domain: + +- Research outputs ([datasets](resource_metrics/dataset), [publications](resource_metrics/publication), [articles](resource_metrics/article_lhb)) +- Infrastructure components ([repositories](resource_metrics/repository), [services](resource_metrics/service), [software](resource_metrics/software)) +- [Learning materials and standards](resource_metrics/learning_resource) +- [Organizations](resource_metrics/organization) and [people](resource_metrics/person) +- Digital resources ([data services](resource_metrics/data_service), [registries](resource_metrics/registry), [aggregators](resource_metrics/aggregator)) + +Each resource type is analyzed individually to understand its representation, completeness, and interconnections within the knowledge graph: + +### Results + +{{ metrics_table_overview_resource() }} + +## Schema Complexity Metrics + +To calculate the complexity of RDF schemas, we combine the presented formality metrics focusing on both structural and semantic aspects. + +### 1. Basic Structural Complexity + +These metrics measure the size and diversity of the RDF schema: + +- **[Number of classes](./schema_complexity_metrics/01_classes)** (_C_) → More classes indicate a more complex schema. 
+- **[Number of properties](./schema_complexity_metrics/02_properties)** (_P_) → More properties mean more relationships between entities.
+- **[Average class hierarchy depth](./schema_complexity_metrics/03_depth)** (_D_avg_) → The mean number of hierarchy levels in the schema.
+- **[Average class hierarchy width](./schema_complexity_metrics/04_width)** (_W_avg_) → The mean number of sibling classes at each hierarchy level.
+
+A structural complexity score can be calculated as:
+
+    C_structural = w1 * C + w2 * P + w3 * D_avg + w4 * W_avg
+
+where _w_i_ are weights that determine the relative importance of each factor.
+
+### 2. Semantic Complexity
+
+If OWL is used, the complexity increases due to advanced semantics:
+
+- **[Number of restrictions](./schema_complexity_metrics/05_restrictions)** (_R_) → More restrictions indicate more constraints and rules.
+- **[Number of logical axioms](./schema_complexity_metrics/06_axioms)** (_A_) → More axioms mean more logical statements and inferences.
+
+A semantic complexity score can be calculated as:
+
+    C_semantic = w5 * R + w6 * A
+
+where _w_i_ are weights that determine the relative importance of each factor.
+
+### 3. Combined Formula for Schema Complexity
+
+To create an overall complexity score, we can combine both structural and semantic aspects:
+
+    C_schema = w1 * C + w2 * P + w3 * D_avg + w4 * W_avg + w5 * R + w6 * A
+
+where _w_i_ are weights that determine the relative importance of each factor.
+
+This formula provides a single numerical value representing schema complexity, which can be normalized (e.g., 0–100) for comparison across different RDF schemas.
+
+### Results
+
+{{ metrics_table_overview_complexity() }}
+
+### References
+
+1) Ontology Structure Metrics:
+
+> Gómez-Pérez et al. (2004) – "Evaluation of Ontologies" - Describes metrics like number of classes, hierarchy depth, and relations
+>
+> *[https://link.springer.com/chapter/10.1007/978-3-540-30202-5_3](https://link.springer.com/chapter/10.1007/978-3-540-30202-5_3)*
+
+2) OWL and Schema Complexity Measurements:
+
+> Tartir & Arpinar (2010) – "Ontology Evaluation and Ranking using OntoQA" - Develops the OntoQA model combining structural and semantic metrics
+>
+> *[https://ieeexplore.ieee.org/document/4338348](https://ieeexplore.ieee.org/document/4338348)*
+
+3) SPARQL Analysis and RDF Complexity:
+
+> Lanzenberger et al.
(2008) – "Ontology Evaluation – State of the Art" - Describes hierarchical depth as key metric for RDF schema complexity +> +> *[https://doi.org/10.1007/978-3-540-92673-3_10](https://doi.org/10.1007/978-3-540-92673-3_10)* + +## About the Metrics + +All metrics: + +- Are implemented as SPARQL queries (stored as `.rq` files) +- Can be executed against the [NFDI4Earth KnowledgeGraph endpoint](https://sparql.knowledgehub.nfdi4earth.de) +- Include execution timestamps + +## Purpose + +These measurements help us to: + +- Monitor the knowledge graph's growth and development +- Evaluate the coverage of earth science domains +- Identify areas for potential improvement +- Understand interconnections between different resource types +- Guide data quality improvements +- Track the integration of new resources diff --git a/docs/metrics/resource_metrics/aggregator.md b/docs/metrics/resource_metrics/aggregator.md new file mode 100644 index 0000000000000000000000000000000000000000..e40651d3cbc8dd90e384746a5384d3b28cd288c8 --- /dev/null +++ b/docs/metrics/resource_metrics/aggregator.md @@ -0,0 +1,4 @@ +{% from "macros/resource_metrics.md" import resource_metrics with context %} +# Aggregator + +{{ resource_metrics("aggregator", "<http://nfdi4earth.de/ontology/Aggregator>") }} diff --git a/docs/metrics/resource_metrics/article_lhb.md b/docs/metrics/resource_metrics/article_lhb.md new file mode 100644 index 0000000000000000000000000000000000000000..8ebe52a27a425eab07a07e628b4a5c2e71f3dd7f --- /dev/null +++ b/docs/metrics/resource_metrics/article_lhb.md @@ -0,0 +1,4 @@ +{% from "macros/resource_metrics.md" import resource_metrics with context %} +# Living Handbook Article + +{{ resource_metrics("article_lhb", "<http://nfdi4earth.de/ontology/LHBArticle>") }} diff --git a/docs/metrics/resource_metrics/data_service.md b/docs/metrics/resource_metrics/data_service.md new file mode 100644 index 0000000000000000000000000000000000000000..c43d8c1f79d2f2b077ed3119de954cebce5ed5c5 --- /dev/null +++ b/docs/metrics/resource_metrics/data_service.md @@ -0,0 +1,4 @@ +{% from "macros/resource_metrics.md" import resource_metrics with context %} +# Data Service + +{{ resource_metrics("data_service", "<http://www.w3.org/ns/dcat#DataService>") }} diff --git a/docs/metrics/resource_metrics/dataset.md b/docs/metrics/resource_metrics/dataset.md new file mode 100644 index 0000000000000000000000000000000000000000..5d52a5f62476ec8f489c7a517401d75ae1cf3931 --- /dev/null +++ b/docs/metrics/resource_metrics/dataset.md @@ -0,0 +1,4 @@ +{% from "macros/resource_metrics.md" import resource_metrics with context %} +# Dataset + +{{ resource_metrics("dataset", "<http://www.w3.org/ns/dcat#Dataset>") }} diff --git a/docs/metrics/resource_metrics/learning_resource.md b/docs/metrics/resource_metrics/learning_resource.md new file mode 100644 index 0000000000000000000000000000000000000000..a4246ce84f5f8d9a95c58b0544cf594d24252c68 --- /dev/null +++ b/docs/metrics/resource_metrics/learning_resource.md @@ -0,0 +1,4 @@ +{% from "macros/resource_metrics.md" import resource_metrics with context %} +# Learning Resource + +{{ resource_metrics("learning_resource", "<http://schema.org/LearningResource>") }} diff --git a/docs/metrics/resource_metrics/organization.md b/docs/metrics/resource_metrics/organization.md new file mode 100644 index 0000000000000000000000000000000000000000..42c1d8ea9d244b92be495574b59a290126beb711 --- /dev/null +++ b/docs/metrics/resource_metrics/organization.md @@ -0,0 +1,4 @@ +{% from "macros/resource_metrics.md" import 
resource_metrics with context %} +# Organization + +{{ resource_metrics("organization", "<http://xmlns.com/foaf/0.1/Organization>") }} diff --git a/docs/metrics/resource_metrics/person.md b/docs/metrics/resource_metrics/person.md new file mode 100644 index 0000000000000000000000000000000000000000..09b3e6456a10c2f5e9de54bf3e808438059c111a --- /dev/null +++ b/docs/metrics/resource_metrics/person.md @@ -0,0 +1,4 @@ +{% from "macros/resource_metrics.md" import resource_metrics with context %} +# Person + +{{ resource_metrics("person", "<http://schema.org/Person>") }} diff --git a/docs/metrics/resource_metrics/publication.md b/docs/metrics/resource_metrics/publication.md new file mode 100644 index 0000000000000000000000000000000000000000..d8732849d083a61d10f8bcf742e728f544a6fe45 --- /dev/null +++ b/docs/metrics/resource_metrics/publication.md @@ -0,0 +1,4 @@ +{% from "macros/resource_metrics.md" import resource_metrics with context %} +# Publication + +{{ resource_metrics("publication", "<http://nfdi4earth.de/ontology/Publication>") }} diff --git a/docs/metrics/resource_metrics/registry.md b/docs/metrics/resource_metrics/registry.md new file mode 100644 index 0000000000000000000000000000000000000000..499b9a276480e10802005bae732f23b04b5a347f --- /dev/null +++ b/docs/metrics/resource_metrics/registry.md @@ -0,0 +1,4 @@ +{% from "macros/resource_metrics.md" import resource_metrics with context %} +# Registry + +{{ resource_metrics("registry", "<http://nfdi4earth.de/ontology/Registry>") }} diff --git a/docs/metrics/resource_metrics/repository.md b/docs/metrics/resource_metrics/repository.md new file mode 100644 index 0000000000000000000000000000000000000000..6593bba91e962e182396212de33333baf3b8a23f --- /dev/null +++ b/docs/metrics/resource_metrics/repository.md @@ -0,0 +1,4 @@ +{% from "macros/resource_metrics.md" import resource_metrics with context %} +# Repository + +{{ resource_metrics("repository", "<http://nfdi4earth.de/ontology/Repository>") }} diff --git a/docs/metrics/resource_metrics/service.md b/docs/metrics/resource_metrics/service.md new file mode 100644 index 0000000000000000000000000000000000000000..b58e404b0e8786b49bd2ebc15118f45ea0d69b48 --- /dev/null +++ b/docs/metrics/resource_metrics/service.md @@ -0,0 +1,4 @@ +{% from "macros/resource_metrics.md" import resource_metrics with context %} +# Service + +{{ resource_metrics("service", "<http://www.w3.org/ns/sparql-service-description#Service>") }} diff --git a/docs/metrics/resource_metrics/software.md b/docs/metrics/resource_metrics/software.md new file mode 100644 index 0000000000000000000000000000000000000000..6210dea7f5c3569453f035967a65b33a5d678e85 --- /dev/null +++ b/docs/metrics/resource_metrics/software.md @@ -0,0 +1,4 @@ +{% from "macros/resource_metrics.md" import resource_metrics with context %} +# Tools & Software + +{{ resource_metrics("software", "<http://schema.org/SoftwareSourceCode>") }} diff --git a/docs/metrics/resource_metrics/standards.md b/docs/metrics/resource_metrics/standards.md new file mode 100644 index 0000000000000000000000000000000000000000..c8959778fb6648e3dc1feb5fceaf26b7555c0c94 --- /dev/null +++ b/docs/metrics/resource_metrics/standards.md @@ -0,0 +1,4 @@ +{% from "macros/resource_metrics.md" import resource_metrics with context %} +# Standards + +{{ resource_metrics("standards", "<http://nfdi4earth.de/ontology/MetadataStandard>") }} diff --git a/docs/metrics/schema_complexity_metrics/01_classes.md b/docs/metrics/schema_complexity_metrics/01_classes.md new file mode 100644 index 
0000000000000000000000000000000000000000..c1f3b96300004990a1766f7583862a7906065630 --- /dev/null +++ b/docs/metrics/schema_complexity_metrics/01_classes.md @@ -0,0 +1,35 @@ +# Number of Schema Classes + +This metric counts the total number of classes defined in our schema. + +## Results + +{{ metrics_table_single_complexity('classes') }} + +## SPARQL Query + +```sparql +{{ include_if_exists("metrics/RF_001.rq") }} +``` + +## Description + +This query: + +- Counts distinct classes defined using rdfs:Class, owl:Class, or sh:NodeShape +- Excludes built-in classes from RDF, RDFS +- Returns a single number representing the total count of user-defined classes in the schema + +## Interpretation + +A higher number indicates: + +- More complex domain modeling +- Broader coverage of concepts +- More detailed classification system + +A lower number might suggest: + +- Simpler schema structure +- Focus on core concepts +- Potential for extended modeling diff --git a/docs/metrics/schema_complexity_metrics/02_properties.md b/docs/metrics/schema_complexity_metrics/02_properties.md new file mode 100644 index 0000000000000000000000000000000000000000..a0528016041478f9ea2f6928e54fe4c35d24e829 --- /dev/null +++ b/docs/metrics/schema_complexity_metrics/02_properties.md @@ -0,0 +1,55 @@ +# Number of Schema Properties + +This metric analyzes the properties defined in our schema through multiple aspects. + +## Results + +{{ metrics_table_single_complexity('properties') }} + +## Total Properties Count + +```sparql +{{ include_if_exists("metrics/RF_002_1.rq") }} +``` + +## Object Properties Count + +```sparql +{{ include_if_exists("metrics/RF_002_2.rq") }} +``` + +## Datatype Properties Count + +```sparql +{{ include_if_exists("metrics/RF_002_3.rq") }} +``` + +## Properties with Domain + +```sparql +{{ include_if_exists("metrics/RF_002_4.rq") }} +``` + +## Properties with Range + +```sparql +{{ include_if_exists("metrics/RF_002_5.rq") }} +``` + +## Combined Metrics Query + +```sparql +{{ include_if_exists("metrics/RF_002_6.rq") }} +``` + +## Interpretation + +Each metric provides specific insights: + +1. **Total** Properties: Overall schema complexity +2. **Object** Properties: Resource interlinking capability +3. **Datatype** Properties: Attribute richness +4. **Domain** Coverage: Property context definition +5. **Range** Coverage: Value constraints and type safety + +These separate metrics allow for more detailed analysis and easier maintenance. Additionally, the **combined query** provides a comprehensive overview in a single execution. diff --git a/docs/metrics/schema_complexity_metrics/03_depth.md b/docs/metrics/schema_complexity_metrics/03_depth.md new file mode 100644 index 0000000000000000000000000000000000000000..4602061ffdf8acb0f886cd4d3da66cc170c8312b --- /dev/null +++ b/docs/metrics/schema_complexity_metrics/03_depth.md @@ -0,0 +1,40 @@ +# Depth of Schema (Average) + +This metric calculates the average depth of the class hierarchy in the schema. 
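+
+For intuition, consider a tiny hypothetical hierarchy (all `ex:` names are placeholders): with `ex:Device rdfs:subClassOf ex:Resource` and `ex:Sensor rdfs:subClassOf ex:Device`, `ex:Device` sits at depth 1 and `ex:Sensor` at depth 2, so the average depth over these two subclasses is (1 + 2) / 2 = 1.5.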
+ +## Results + +{{ metrics_table_single_complexity('depth') }} + + +## SPARQL Query + +```sparql +{{ include_if_exists("metrics/RF_003.rq") }} +``` + +## Description + +This query: +- Identifies all user-defined classes +- Calculates the path length to all superclasses +- Determines the average of these path lengths +- Excludes standard RDF/OWL classes + +## Interpretation + +A higher average value indicates: +- Deeper class hierarchies +- More detailed concept modeling +- Stronger specialization + +A lower value suggests: +- Flatter hierarchies +- Broader rather than deeper structuring +- Potentially easier maintenance + +## Notes + +- Depth is calculated by counting superclass relationships +- owl:Thing is not counted +- Multiple inheritance is considered \ No newline at end of file diff --git a/docs/metrics/schema_complexity_metrics/04_width.md b/docs/metrics/schema_complexity_metrics/04_width.md new file mode 100644 index 0000000000000000000000000000000000000000..3d07f42cde9557bd0a77dfcb8cb26d88b378a89d --- /dev/null +++ b/docs/metrics/schema_complexity_metrics/04_width.md @@ -0,0 +1,43 @@ +# Width of Schema (Average) + +This metric calculates the average number of subclasses per class (branching factor) in the schema. + +## Results + +{{ metrics_table_single_complexity('width') }} + +## SPARQL Query + +```sparql +{{ include_if_exists("metrics/RF_004.rq") }} +``` + +## Description + +This query: + +- Identifies all classes in the schema +- Counts the number of direct subclasses for each class +- Calculates the average number of subclasses (branching factor) +- Excludes built-in RDF/OWL classes +- Considers only direct subclass relationships (no transitive closure) + +## Interpretation + +A higher average width indicates: + +- Broader classification at each level +- More horizontal spread in the taxonomy +- Potentially flatter hierarchies + +A lower average width suggests: + +- More vertical organization +- More specialized hierarchies +- Potentially deeper class trees + +## Notes + +- Only counts direct subclass relationships +- Includes classes with no subclasses (count = 0) +- Multiple inheritance is handled correctly through DISTINCT counting \ No newline at end of file diff --git a/docs/metrics/schema_complexity_metrics/05_restrictions.md b/docs/metrics/schema_complexity_metrics/05_restrictions.md new file mode 100644 index 0000000000000000000000000000000000000000..7fa5d5e2362103426f7022f8fb60d8bc17e968ae --- /dev/null +++ b/docs/metrics/schema_complexity_metrics/05_restrictions.md @@ -0,0 +1,44 @@ +# Number of Restrictions + +This metric counts the various types of OWL restrictions defined in the schema. 
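+
+For example, a hypothetical restriction requiring every dataset to carry at least one license would appear as an `owl:Restriction` node combining `owl:onProperty dct:license` with `owl:minCardinality 1`; each such node is picked up by the counts below.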
+ +## Results + +{{ metrics_table_single_complexity('restrictions') }} + +## SPARQL Query + +```sparql +{{ include_if_exists("metrics/RF_006.rq") }} +``` + +## Description + +This query counts: + +- Total number of OWL restrictions +- Specific restriction types: + - someValuesFrom restrictions + - allValuesFrom restrictions + - hasValue restrictions + - Cardinality restrictions (min, max, exact) + +## Interpretation + +Higher numbers indicate: + +- More constrained schema +- More precise data modeling +- Higher validation requirements + +Breakdown by type shows: + +- Value constraints (someValues/allValues) +- Fixed value requirements (hasValue) +- Quantity rules (cardinality) + +## Notes + +- Excludes built-in OWL restrictions +- One restriction can have multiple types +- Cardinality includes min/max/exact \ No newline at end of file diff --git a/docs/metrics/schema_complexity_metrics/06_axioms.md b/docs/metrics/schema_complexity_metrics/06_axioms.md new file mode 100644 index 0000000000000000000000000000000000000000..66846dff4e68595e3139a009878ced0038e4a873 --- /dev/null +++ b/docs/metrics/schema_complexity_metrics/06_axioms.md @@ -0,0 +1,43 @@ +# Number of Logical Axioms + +This metric counts the various types of OWL logical axioms defined in the schema. + +## Results + +{{ metrics_table_single_complexity('axioms') }} + +## SPARQL Query + +```sparql +{{ include_if_exists("metrics/RF_006.rq") }} +``` + +## Description + +This query counts OWL logical axioms including: + +- Equivalent classes (owl:equivalentClass) +- Disjoint classes (owl:disjointWith) +- Complement classes (owl:complementOf) +- Intersection classes (owl:intersectionOf) +- Union classes (owl:unionOf) + +## Interpretation + +Higher numbers indicate: + +- More complex logical relationships +- Richer semantic modeling +- Greater inferencing potential + +Type distribution shows: + +- Class equivalence relationships +- Class disjointness constraints +- Complex class definitions + +## Notes + +- Excludes built-in OWL axioms +- Counts distinct class usages +- Combined total gives overall axiom complexity \ No newline at end of file diff --git a/examples/Educational resources.rq b/docs/queries/examples/Educational resources.rq similarity index 100% rename from examples/Educational resources.rq rename to docs/queries/examples/Educational resources.rq diff --git a/examples/Metadata.rq b/docs/queries/examples/Metadata.rq similarity index 100% rename from examples/Metadata.rq rename to docs/queries/examples/Metadata.rq diff --git a/examples/Services in the NFDI4Earth.rq b/docs/queries/examples/Services in the NFDI4Earth.rq similarity index 100% rename from examples/Services in the NFDI4Earth.rq rename to docs/queries/examples/Services in the NFDI4Earth.rq diff --git a/docs/queries/metrics/GM001.rq b/docs/queries/metrics/GM001.rq new file mode 100644 index 0000000000000000000000000000000000000000..95b0c14d9ba00b62d811b35265ab92108eba8d5e --- /dev/null +++ b/docs/queries/metrics/GM001.rq @@ -0,0 +1,6 @@ +# The number of instances in a graph + +SELECT (COUNT(?instance) AS ?numInstances) +WHERE { + ?instance a [] . +} diff --git a/docs/queries/metrics/GM002_1.rq b/docs/queries/metrics/GM002_1.rq new file mode 100644 index 0000000000000000000000000000000000000000..0bdd96a5455d788961d33200f410ce2b6b8d1bfb --- /dev/null +++ b/docs/queries/metrics/GM002_1.rq @@ -0,0 +1,13 @@ +# Total number of assertions (between entities & literals) +# +# Explanation: +# ?subject ?predicate ?object searches through all triples (assertions) in the dataset. 
+# COUNT(*) AS ?numAssertions counts the number of triples.
+#
+# This query returns the total number of edges in the graph, regardless of
+# whether they exist between entities or between entities and literals.
+
+SELECT (COUNT(*) AS ?numAssertions)
+WHERE {
+  ?subject ?predicate ?object .
+}
diff --git a/docs/queries/metrics/GM002_2.rq b/docs/queries/metrics/GM002_2.rq
new file mode 100644
index 0000000000000000000000000000000000000000..a20aee9e3d8517c584861ba86c3d39c7ae0ccc67
--- /dev/null
+++ b/docs/queries/metrics/GM002_2.rq
@@ -0,0 +1,12 @@
+# The number of assertions between entities (without literals)
+#
+# Explanation:
+# ?subject ?predicate ?object searches through all triples.
+# FILTER(isIRI(?object)) checks if the object is an IRI (i.e. an entity) and not a literal (e.g. not a string, date, or number).
+# COUNT(*) AS ?numEntityEdges counts the number of corresponding edges.
+
+SELECT (COUNT(*) AS ?numEntityEdges)
+WHERE {
+  ?subject ?predicate ?object .
+  FILTER(isIRI(?object)) # Only objects that are URIs (i.e. entities)
+}
diff --git a/docs/queries/metrics/GM002_3.rq b/docs/queries/metrics/GM002_3.rq
new file mode 100644
index 0000000000000000000000000000000000000000..ec6ab145fc635da24af274560fa462189b4fd7e9
--- /dev/null
+++ b/docs/queries/metrics/GM002_3.rq
@@ -0,0 +1,12 @@
+# The number of assertions from entities to literals (entity-to-literal edges)
+#
+# Explanation:
+# ?subject ?predicate ?object searches through all triples.
+# FILTER(isLiteral(?object)) ensures that only edges are counted where the object is a literal (not an IRI).
+# COUNT(*) AS ?numLiteralAssertions counts the number of corresponding edges.
+
+SELECT (COUNT(*) AS ?numLiteralAssertions)
+WHERE {
+  ?subject ?predicate ?object .
+  FILTER(isLiteral(?object)) # Only objects that are literals
+}
diff --git a/docs/queries/metrics/GM003_1.rq b/docs/queries/metrics/GM003_1.rq
new file mode 100644
index 0000000000000000000000000000000000000000..db52f5b5d54e67a1cb85a6822f3c5c3f220078f1
--- /dev/null
+++ b/docs/queries/metrics/GM003_1.rq
@@ -0,0 +1,34 @@
+# The average linkage degree
+# (i.e.: how many assertions per entity does the graph contain)
+#
+# WARNING: This query can be very resource-intensive and slow on large datasets!
+# It might even time out or crash for graphs with millions of triples.
+# Consider using batch processing with LIMIT and OFFSET for large datasets.
+#
+# Explanation:
+# First count outgoing edges per entity
+# Then count incoming edges per entity
+# Add both counts and calculate average
+
+SELECT (AVG(?totalDegree) as ?avgLinkageDegree)
+WHERE {
+  {
+    SELECT ?entity ((?outDegree + ?inDegree) as ?totalDegree)
+    WHERE {
+      {
+        SELECT ?entity (COUNT(*) as ?outDegree)
+        WHERE {
+          ?entity ?p ?o .
+        }
+        GROUP BY ?entity
+      }
+      {
+        SELECT ?entity (COUNT(*) as ?inDegree)
+        WHERE {
+          ?s ?p ?entity .
+        }
+        GROUP BY ?entity
+      }
+    }
+  }
+}
diff --git a/docs/queries/metrics/GM003_2.rq b/docs/queries/metrics/GM003_2.rq
new file mode 100644
index 0000000000000000000000000000000000000000..2111a124ea6bcd2b8ccdaba59bb0643bcc66bc0b
--- /dev/null
+++ b/docs/queries/metrics/GM003_2.rq
@@ -0,0 +1,39 @@
+# The average linkage degree with batch processing
+# (i.e.: how many assertions per entity does the graph contain)
+#
+# Explanation:
+# This is an alternative version that limits the number of
+# analysed entities.
+# We have found that starting with a limit of 8,000,000 entities,
+# the results begin to provide meaningful outcomes, which do not change
+# significantly with a higher limit.
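+#
+# Note: the join of the two degree sub-selects below keeps only entities
+# that have at least one outgoing and at least one incoming edge; entities
+# with edges in only one direction do not contribute to this average.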
+
+SELECT (AVG(?totalDegree) as ?avgLinkageDegree)
+WHERE {
+  {
+    SELECT ?entity ((?outDegree + ?inDegree) as ?totalDegree)
+    WHERE {
+      {
+        SELECT DISTINCT ?entity
+        WHERE {
+          { ?entity ?p ?o } UNION { ?s ?p ?entity }
+        }
+        LIMIT 8000000  # Analyse at most this many entities
+      }
+      {
+        SELECT ?entity (COUNT(*) as ?outDegree)
+        WHERE {
+          ?entity ?p ?o .
+        }
+        GROUP BY ?entity
+      }
+      {
+        SELECT ?entity (COUNT(*) as ?inDegree)
+        WHERE {
+          ?s ?p ?entity .
+        }
+        GROUP BY ?entity
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/docs/queries/metrics/GM003_3.rq b/docs/queries/metrics/GM003_3.rq
new file mode 100644
index 0000000000000000000000000000000000000000..3f1028abdeca359d3eaf983725555130a156bdbb
--- /dev/null
+++ b/docs/queries/metrics/GM003_3.rq
@@ -0,0 +1,25 @@
+# The average outgoing linkage degree
+# (i.e.: how many outgoing assertions per entity does the graph contain)
+#
+# Explanation:
+# This is a simplified version that only counts outgoing edges
+# Limited to 1000 entities for testing purposes
+
+SELECT (AVG(?outDegree) as ?avgOutgoingLinkageDegree)
+WHERE {
+  {
+    SELECT ?entity (COUNT(*) as ?outDegree)
+    WHERE {
+      {
+        SELECT DISTINCT ?entity
+        WHERE {
+          ?entity ?p ?o .
+        }
+        LIMIT 1000  # Process 1000 entities at a time
+        OFFSET 0    # Start with first batch
+      }
+      ?entity ?p ?o .
+    }
+    GROUP BY ?entity
+  }
+}
\ No newline at end of file
diff --git a/docs/queries/metrics/GM003_4.rq b/docs/queries/metrics/GM003_4.rq
new file mode 100644
index 0000000000000000000000000000000000000000..1a61203014a0daedf0f0eff783818a6a668791db
--- /dev/null
+++ b/docs/queries/metrics/GM003_4.rq
@@ -0,0 +1,26 @@
+# The average incoming linkage degree
+# (i.e.: how many incoming assertions per entity does the graph contain)
+#
+# Explanation:
+# Simple version that only counts incoming edges per entity
+# No complex sub-selects needed for this case
+# Limited to 1000 entities for testing purposes
+
+SELECT (AVG(?inDegree) as ?avgIncomingLinkageDegree)
+WHERE {
+  {
+    SELECT ?entity (COUNT(*) as ?inDegree)
+    WHERE {
+      {
+        SELECT DISTINCT ?entity
+        WHERE {
+          ?s ?p ?entity .
+        }
+        LIMIT 1000  # Process 1000 entities at a time
+        OFFSET 0    # Start with first batch
+      }
+      ?s ?p ?entity .
+    }
+    GROUP BY ?entity
+  }
+}
\ No newline at end of file
diff --git a/docs/queries/metrics/GM004_1.rq b/docs/queries/metrics/GM004_1.rq
new file mode 100644
index 0000000000000000000000000000000000000000..5c2ffa780e4763bd6947e10c75e0df324c763c63
--- /dev/null
+++ b/docs/queries/metrics/GM004_1.rq
@@ -0,0 +1,7 @@
+# Count Outgoing Edges Per Node
+
+SELECT ?source (COUNT(?outgoing) as ?outEdges)
+WHERE {
+  ?source ?p ?outgoing .
+}
+GROUP BY ?source
diff --git a/docs/queries/metrics/GM004_2.rq b/docs/queries/metrics/GM004_2.rq
new file mode 100644
index 0000000000000000000000000000000000000000..d7ff687a228817d3f0b0dbf0f57ae8b36648453c
--- /dev/null
+++ b/docs/queries/metrics/GM004_2.rq
@@ -0,0 +1,6 @@
+# Returns a single number: the count of unique nodes with at least one outgoing edge.
+
+SELECT (COUNT(DISTINCT ?source) as ?uniqueNodes)
+WHERE {
+  ?source ?p ?outgoing .
+}
\ No newline at end of file
diff --git a/docs/queries/metrics/GM004_3.rq b/docs/queries/metrics/GM004_3.rq
new file mode 100644
index 0000000000000000000000000000000000000000..65391cc9836cf9fa176f59ae884993437c535d11
--- /dev/null
+++ b/docs/queries/metrics/GM004_3.rq
@@ -0,0 +1,7 @@
+# Returns a single number representing the median position of outgoing edges
+
+SELECT ((COUNT(DISTINCT ?source) / 2) AS ?medianPosition)
+WHERE {
+  ?source ?p ?outgoing .
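+  # every distinct ?source is a node with at least one outgoing edge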
+}
\ No newline at end of file
diff --git a/docs/queries/metrics/GM004_4.rq b/docs/queries/metrics/GM004_4.rq
new file mode 100644
index 0000000000000000000000000000000000000000..4249889e8bbc8dd732fd6cc0435dfccd3fd39094
--- /dev/null
+++ b/docs/queries/metrics/GM004_4.rq
@@ -0,0 +1,14 @@
+# This SPARQL query calculates the median out-degree in the graph.
+# The placeholder {median_position} must be replaced with the actual position of the median.
+
+SELECT (?outEdges AS ?outMedian)
+WHERE {
+  SELECT ?source (COUNT(?outgoing) as ?outEdges)
+  WHERE {
+    ?source ?p ?outgoing .
+  }
+  GROUP BY ?source
+  ORDER BY ASC(?outEdges)
+}
+OFFSET {median_position}
+LIMIT 1
diff --git a/docs/queries/metrics/GM004_5.rq b/docs/queries/metrics/GM004_5.rq
new file mode 100644
index 0000000000000000000000000000000000000000..1dcb762b9c9e70d2e5168081f0988fc9eddb041e
--- /dev/null
+++ b/docs/queries/metrics/GM004_5.rq
@@ -0,0 +1,9 @@
+# Query to find the node with the minimum number of outgoing edges in the graph
+
+SELECT (COUNT(?outgoing) AS ?outEdges)
+WHERE {
+  ?source ?p ?outgoing .
+}
+GROUP BY ?source
+ORDER BY ASC(?outEdges)
+LIMIT 1
\ No newline at end of file
diff --git a/docs/queries/metrics/GM004_6.rq b/docs/queries/metrics/GM004_6.rq
new file mode 100644
index 0000000000000000000000000000000000000000..6105231d800a9c0ce503f33a00998fb35cbfa557
--- /dev/null
+++ b/docs/queries/metrics/GM004_6.rq
@@ -0,0 +1,9 @@
+# Query to find the node with the maximum number of outgoing edges in the graph
+
+SELECT (COUNT(?outgoing) AS ?outEdges)
+WHERE {
+  ?source ?p ?outgoing .
+}
+GROUP BY ?source
+ORDER BY DESC(?outEdges)
+LIMIT 1
\ No newline at end of file
diff --git a/docs/queries/metrics/GM005_1.rq b/docs/queries/metrics/GM005_1.rq
new file mode 100644
index 0000000000000000000000000000000000000000..e4440f0ce38648023baf6d780a10cb9e721a51c8
--- /dev/null
+++ b/docs/queries/metrics/GM005_1.rq
@@ -0,0 +1,8 @@
+# Count Incoming Edges Per Node
+
+SELECT ?target (COUNT(?incoming) as ?inEdges)
+WHERE {
+  ?incoming ?p ?target .
+}
+GROUP BY ?target
+ORDER BY ASC(?inEdges)
diff --git a/docs/queries/metrics/GM005_2.rq b/docs/queries/metrics/GM005_2.rq
new file mode 100644
index 0000000000000000000000000000000000000000..3c319e1a51896c20ebd44887da5622dba89d6370
--- /dev/null
+++ b/docs/queries/metrics/GM005_2.rq
@@ -0,0 +1,6 @@
+# Returns a single number: the count of unique nodes with at least one incoming edge.
+
+SELECT (COUNT(DISTINCT ?target) as ?uniqueNodes)
+WHERE {
+  ?incoming ?p ?target .
+}
\ No newline at end of file
diff --git a/docs/queries/metrics/GM005_3.rq b/docs/queries/metrics/GM005_3.rq
new file mode 100644
index 0000000000000000000000000000000000000000..3c386ee58f97418fd4141dcecde8a61d1ec55bff
--- /dev/null
+++ b/docs/queries/metrics/GM005_3.rq
@@ -0,0 +1,6 @@
+# Returns a single number representing the median position of incoming edges
+
+SELECT ((COUNT(DISTINCT ?target) / 2) AS ?medianPosition)
+WHERE {
+  ?incoming ?p ?target .
+}
\ No newline at end of file
diff --git a/docs/queries/metrics/GM005_4.rq b/docs/queries/metrics/GM005_4.rq
new file mode 100644
index 0000000000000000000000000000000000000000..952626e90dfea9d0094559757ba530296d4919f4
--- /dev/null
+++ b/docs/queries/metrics/GM005_4.rq
@@ -0,0 +1,15 @@
+# This SPARQL query calculates the median in-degree in the graph.
+# The placeholder {median_position} must be replaced with the actual position of the median.
+
+SELECT (?inEdges AS ?inMedian)
+WHERE {
+  SELECT ?target (COUNT(?incoming) as ?inEdges)
+  WHERE {
+    ?incoming ?p ?target .
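+    # one row per incoming triple; COUNT gives the in-degree of each ?target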
+  }
+  GROUP BY ?target
+  ORDER BY ASC(?inEdges)
+}
+OFFSET {median_position}
+LIMIT 1
diff --git a/docs/queries/metrics/GM005_5.rq b/docs/queries/metrics/GM005_5.rq
new file mode 100644
index 0000000000000000000000000000000000000000..7465b95c0457c2a36f992aba1d4bb440fe2b3982
--- /dev/null
+++ b/docs/queries/metrics/GM005_5.rq
@@ -0,0 +1,9 @@
+# Query to find the node with the minimum number of incoming edges in the graph
+
+SELECT (COUNT(?incoming) AS ?inEdges)
+WHERE {
+  ?incoming ?p ?target .
+}
+GROUP BY ?target
+ORDER BY ASC(?inEdges)
+LIMIT 1
diff --git a/docs/queries/metrics/GM005_6.rq b/docs/queries/metrics/GM005_6.rq
new file mode 100644
index 0000000000000000000000000000000000000000..f01a1dcee6139b4f66e484ba75306f95fb21120f
--- /dev/null
+++ b/docs/queries/metrics/GM005_6.rq
@@ -0,0 +1,9 @@
+# Query to find the node with the maximum number of incoming edges in the graph
+
+SELECT (COUNT(?incoming) AS ?inEdges)
+WHERE {
+  ?incoming ?p ?target .
+}
+GROUP BY ?target
+ORDER BY DESC(?inEdges)
+LIMIT 1
diff --git a/docs/queries/metrics/RF_001.rq b/docs/queries/metrics/RF_001.rq
new file mode 100644
index 0000000000000000000000000000000000000000..9ccb05d6ae83fcb5dcd310e32428b9cf88210a19
--- /dev/null
+++ b/docs/queries/metrics/RF_001.rq
@@ -0,0 +1,23 @@
+# This query counts the number of distinct RDFS and OWL classes,
+# excluding built-in classes from RDF/RDFS/OWL vocabularies
+
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+PREFIX owl: <http://www.w3.org/2002/07/owl#>
+
+SELECT (COUNT(DISTINCT ?class) AS ?numberOfClasses)
+WHERE {
+  {
+    # Count RDFS classes
+    ?class a rdfs:Class .
+  }
+  UNION
+  {
+    # Count OWL classes
+    ?class a owl:Class .
+  }
+
+  # Filter out built-in classes from RDF/RDFS/OWL vocabularies
+  FILTER(!STRSTARTS(STR(?class), "http://www.w3.org/2002/07/owl"))
+  FILTER(!STRSTARTS(STR(?class), "http://www.w3.org/2000/01/rdf-schema"))
+  FILTER(!STRSTARTS(STR(?class), "http://www.w3.org/1999/02/22-rdf-syntax-ns"))
+}
diff --git a/docs/queries/metrics/RF_002_1.rq b/docs/queries/metrics/RF_002_1.rq
new file mode 100644
index 0000000000000000000000000000000000000000..423b12989098d3efa374f5f9706ce83f873f5d76
--- /dev/null
+++ b/docs/queries/metrics/RF_002_1.rq
@@ -0,0 +1,25 @@
+# This SPARQL query counts the total number of distinct properties,
+# excluding built-in properties
+
+PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+PREFIX owl: <http://www.w3.org/2002/07/owl#>
+
+SELECT (COUNT(DISTINCT ?property) AS ?totalProperties)
+WHERE {
+  {
+    ?property a rdf:Property .
+  }
+  UNION
+  {
+    ?property a owl:ObjectProperty .
+  }
+  UNION
+  {
+    ?property a owl:DatatypeProperty .
+  }
+
+  # Filter out built-in properties
+  FILTER(!STRSTARTS(STR(?property), "http://www.w3.org/2002/07/owl"))
+  FILTER(!STRSTARTS(STR(?property), "http://www.w3.org/2000/01/rdf-schema"))
+  FILTER(!STRSTARTS(STR(?property), "http://www.w3.org/1999/02/22-rdf-syntax-ns"))
+}
\ No newline at end of file
diff --git a/docs/queries/metrics/RF_002_2.rq b/docs/queries/metrics/RF_002_2.rq
new file mode 100644
index 0000000000000000000000000000000000000000..4338eb78e0471d627e3ee1aaf3bac34ef938c103
--- /dev/null
+++ b/docs/queries/metrics/RF_002_2.rq
@@ -0,0 +1,15 @@
+# This SPARQL query counts the number of distinct object properties,
+# excluding built-in properties
+
+PREFIX owl: <http://www.w3.org/2002/07/owl#>
+
+SELECT (COUNT(DISTINCT ?property) AS ?objectProperties)
+WHERE {
+  ?property a owl:ObjectProperty .
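+  # only explicitly typed owl:ObjectProperty declarations are counted here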
+
+  # Filter out built-in properties
+  FILTER(!STRSTARTS(STR(?property), "http://www.w3.org/2002/07/owl"))
+  FILTER(!STRSTARTS(STR(?property), "http://www.w3.org/2000/01/rdf-schema"))
+  FILTER(!STRSTARTS(STR(?property), "http://www.w3.org/1999/02/22-rdf-syntax-ns"))
+}
\ No newline at end of file
diff --git a/docs/queries/metrics/RF_002_3.rq b/docs/queries/metrics/RF_002_3.rq
new file mode 100644
index 0000000000000000000000000000000000000000..1fa9bcda56be2f4dbe048cbbe7e60c7e715d023d
--- /dev/null
+++ b/docs/queries/metrics/RF_002_3.rq
@@ -0,0 +1,14 @@
+# This SPARQL query counts the number of distinct datatype properties,
+# excluding built-in properties
+
+PREFIX owl: <http://www.w3.org/2002/07/owl#>
+
+SELECT (COUNT(DISTINCT ?property) AS ?datatypeProperties)
+WHERE {
+  ?property a owl:DatatypeProperty .
+
+  # Filter out built-in properties
+  FILTER(!STRSTARTS(STR(?property), "http://www.w3.org/2002/07/owl"))
+  FILTER(!STRSTARTS(STR(?property), "http://www.w3.org/2000/01/rdf-schema"))
+  FILTER(!STRSTARTS(STR(?property), "http://www.w3.org/1999/02/22-rdf-syntax-ns"))
+}
\ No newline at end of file
diff --git a/docs/queries/metrics/RF_002_4.rq b/docs/queries/metrics/RF_002_4.rq
new file mode 100644
index 0000000000000000000000000000000000000000..82ebe448bea7a5213cc87790cb5131f970897086
--- /dev/null
+++ b/docs/queries/metrics/RF_002_4.rq
@@ -0,0 +1,29 @@
+# This SPARQL query counts the number of distinct properties with a
+# specified domain, excluding built-in properties
+
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+PREFIX owl: <http://www.w3.org/2002/07/owl#>
+
+SELECT (COUNT(DISTINCT ?property) AS ?propertiesWithDomain)
+WHERE {
+  {
+    ?property a rdf:Property ;
+              rdfs:domain ?domain .
+  }
+  UNION
+  {
+    ?property a owl:ObjectProperty ;
+              rdfs:domain ?domain .
+  }
+  UNION
+  {
+    ?property a owl:DatatypeProperty ;
+              rdfs:domain ?domain .
+  }
+
+  # Filter out built-in properties
+  FILTER(!STRSTARTS(STR(?property), "http://www.w3.org/2002/07/owl"))
+  FILTER(!STRSTARTS(STR(?property), "http://www.w3.org/2000/01/rdf-schema"))
+  FILTER(!STRSTARTS(STR(?property), "http://www.w3.org/1999/02/22-rdf-syntax-ns"))
+}
diff --git a/docs/queries/metrics/RF_002_5.rq b/docs/queries/metrics/RF_002_5.rq
new file mode 100644
index 0000000000000000000000000000000000000000..4fea73cd236a4ce98b6be0bf8967de8c6b8a334e
--- /dev/null
+++ b/docs/queries/metrics/RF_002_5.rq
@@ -0,0 +1,30 @@
+# This query counts the number of distinct properties with a specified range,
+# excluding built-in properties
+
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+PREFIX owl: <http://www.w3.org/2002/07/owl#>
+
+SELECT (COUNT(DISTINCT ?property) AS ?propertiesWithRange)
+WHERE {
+  {
+    ?property a rdf:Property ;
+              rdfs:range ?range .
+  }
+  UNION
+  {
+    ?property a owl:ObjectProperty ;
+              rdfs:range ?range .
+  }
+  UNION
+  {
+    ?property a owl:DatatypeProperty ;
+              rdfs:range ?range .
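+    # (each branch requires an explicit rdfs:range declaration)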
+  }
+
+  # Filter out built-in properties
+  FILTER(!STRSTARTS(STR(?property), "http://www.w3.org/2002/07/owl"))
+  FILTER(!STRSTARTS(STR(?property), "http://www.w3.org/2000/01/rdf-schema"))
+  FILTER(!STRSTARTS(STR(?property), "http://www.w3.org/1999/02/22-rdf-syntax-ns"))
+}
diff --git a/docs/queries/metrics/RF_002_6.rq b/docs/queries/metrics/RF_002_6.rq
new file mode 100644
index 0000000000000000000000000000000000000000..a4bfec1b0610a95d64f2d7702927af27a18e652f
--- /dev/null
+++ b/docs/queries/metrics/RF_002_6.rq
@@ -0,0 +1,46 @@
+# This SPARQL query counts various types of properties in an RDF dataset,
+# excluding built-in properties.
+
+PREFIX owl: <http://www.w3.org/2002/07/owl#>
+PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+
+SELECT
+  (COUNT(DISTINCT ?property) AS ?totalProperties)
+  (COUNT(DISTINCT ?objectProperty) AS ?objectProperties)
+  (COUNT(DISTINCT ?datatypeProperty) AS ?datatypeProperties)
+  (COUNT(DISTINCT ?withDomain) AS ?propertiesWithDomain)
+  (COUNT(DISTINCT ?withRange) AS ?propertiesWithRange)
+WHERE {
+  {
+    # Count RDF/RDFS properties
+    ?property a rdf:Property .
+  }
+  UNION
+  {
+    # Count OWL Object Properties
+    ?property a owl:ObjectProperty .
+    BIND(?property AS ?objectProperty)
+  }
+  UNION
+  {
+    # Count OWL Datatype Properties
+    ?property a owl:DatatypeProperty .
+    BIND(?property AS ?datatypeProperty)
+  }
+
+  # Filter out built-in properties
+  FILTER(!STRSTARTS(STR(?property), "http://www.w3.org/2002/07/owl"))
+  FILTER(!STRSTARTS(STR(?property), "http://www.w3.org/2000/01/rdf-schema"))
+  FILTER(!STRSTARTS(STR(?property), "http://www.w3.org/1999/02/22-rdf-syntax-ns"))
+
+  # Check for domain and range definitions
+  OPTIONAL {
+    ?property rdfs:domain ?domain .
+    BIND(?property AS ?withDomain)
+  }
+  OPTIONAL {
+    ?property rdfs:range ?range .
+    BIND(?property AS ?withRange)
+  }
+}
\ No newline at end of file
diff --git a/docs/queries/metrics/RF_003.rq b/docs/queries/metrics/RF_003.rq
new file mode 100644
index 0000000000000000000000000000000000000000..601397c4741dcd61882ab776fa2328a7a2f942fb
--- /dev/null
+++ b/docs/queries/metrics/RF_003.rq
@@ -0,0 +1,33 @@
+# This SPARQL query calculates the average hierarchy depth of all classes
+# in an RDF dataset, excluding built-in RDF and OWL classes.
+
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+PREFIX owl: <http://www.w3.org/2002/07/owl#>
+
+SELECT (AVG(?depth) AS ?averageHierarchyDepth)
+WHERE {
+  {
+    SELECT ?class (COUNT(?superClass) AS ?depth)
+    WHERE {
+      # Find all classes
+      {
+        ?class a rdfs:Class .
+      } UNION {
+        ?class a owl:Class .
+      }
+
+      # Only classes with at least one superclass contribute
+      ?class rdfs:subClassOf ?directSuper .
+
+      # Count the superclasses reachable via the direct superclass
+      ?directSuper rdfs:subClassOf ?superClass .
+
+      # Filter out built-in classes
+      FILTER(!STRSTARTS(STR(?class), "http://www.w3.org/2002/07/owl"))
+      FILTER(!STRSTARTS(STR(?class), "http://www.w3.org/2000/01/rdf-schema"))
+      FILTER(!STRSTARTS(STR(?superClass), "http://www.w3.org/2002/07/owl"))
+      FILTER(!STRSTARTS(STR(?superClass), "http://www.w3.org/2000/01/rdf-schema"))
+    }
+    GROUP BY ?class
+  }
+}
\ No newline at end of file
diff --git a/docs/queries/metrics/RF_004.rq b/docs/queries/metrics/RF_004.rq
new file mode 100644
index 0000000000000000000000000000000000000000..aa0e49dc51348338bc0ba279693940b0fec30820
--- /dev/null
+++ b/docs/queries/metrics/RF_004.rq
@@ -0,0 +1,45 @@
+# This SPARQL query calculates the average branching factor (width) of classes in
+# an RDF dataset.
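+# (Classes without any subclass contribute 0 to the average.)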
+# The branching factor is defined as the average number of direct
+# subclasses per class.
+# The query first identifies all classes (both rdfs:Class and owl:Class)
+# and then counts their direct subclasses, excluding built-in classes
+
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+PREFIX owl: <http://www.w3.org/2002/07/owl#>
+
+SELECT (AVG(?subClassCount) AS ?averageBranchingFactor)
+WHERE {
+  {
+    SELECT ?class (COUNT(DISTINCT ?subClass) AS ?subClassCount)
+    WHERE {
+      # Find all classes that have subclasses
+      {
+        ?class a rdfs:Class .
+      } UNION {
+        ?class a owl:Class .
+      }
+
+      # Count direct subclasses
+      OPTIONAL {
+        ?subClass rdfs:subClassOf ?class .
+
+        # Ensure subClass is actually a class
+        {
+          ?subClass a rdfs:Class .
+        } UNION {
+          ?subClass a owl:Class .
+        }
+      }
+
+      # Filter out built-in classes
+      # (?subClass may be unbound because of the OPTIONAL, so guard with BOUND)
+      FILTER(!STRSTARTS(STR(?class), "http://www.w3.org/2002/07/owl"))
+      FILTER(!STRSTARTS(STR(?class), "http://www.w3.org/2000/01/rdf-schema"))
+      FILTER(!BOUND(?subClass) || !STRSTARTS(STR(?subClass), "http://www.w3.org/2002/07/owl"))
+      FILTER(!BOUND(?subClass) || !STRSTARTS(STR(?subClass), "http://www.w3.org/2000/01/rdf-schema"))
+    }
+    GROUP BY ?class
+  }
+}
\ No newline at end of file
diff --git a/docs/queries/metrics/RF_005.rq b/docs/queries/metrics/RF_005.rq
new file mode 100644
index 0000000000000000000000000000000000000000..b256108b364083f0b9338006f90f2639367905d7
--- /dev/null
+++ b/docs/queries/metrics/RF_005.rq
@@ -0,0 +1,40 @@
+# This SPARQL query retrieves various counts of OWL restrictions
+# from a dataset, excluding restrictions from built-in vocabularies
+
+PREFIX owl: <http://www.w3.org/2002/07/owl#>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+
+SELECT
+  (COUNT(DISTINCT ?restriction) as ?totalRestrictions)
+  (COUNT(DISTINCT ?someValues) as ?someValuesFrom)
+  (COUNT(DISTINCT ?allValues) as ?allValuesFrom)
+  (COUNT(DISTINCT ?hasValue) as ?hasValueRestrictions)
+  (COUNT(DISTINCT ?cardinality) as ?cardinalityRestrictions)
+WHERE {
+  {
+    ?restriction a owl:Restriction .
+    FILTER(!STRSTARTS(STR(?restriction), "http://www.w3.org/"))
+  }
+  OPTIONAL {
+    ?someValues a owl:Restriction ;
+                owl:someValuesFrom ?target1 .
+  }
+  OPTIONAL {
+    ?allValues a owl:Restriction ;
+               owl:allValuesFrom ?target2 .
+  }
+  OPTIONAL {
+    ?hasValue a owl:Restriction ;
+              owl:hasValue ?target3 .
+  }
+  OPTIONAL {
+    ?cardinality a owl:Restriction .
+    {
+      ?cardinality owl:cardinality ?card1 .
+    } UNION {
+      ?cardinality owl:minCardinality ?card2 .
+    } UNION {
+      ?cardinality owl:maxCardinality ?card3 .
+    }
+  }
+}
\ No newline at end of file
diff --git a/docs/queries/metrics/RF_006.rq b/docs/queries/metrics/RF_006.rq
new file mode 100644
index 0000000000000000000000000000000000000000..2ba561308ccff48a5d9acf6f04bbedbe2987bb4b
--- /dev/null
+++ b/docs/queries/metrics/RF_006.rq
@@ -0,0 +1,22 @@
+# This query counts the total number of distinct logical axioms in the dataset,
+# excluding built-in terms.
+
+PREFIX owl: <http://www.w3.org/2002/07/owl#>
+
+SELECT
+  (COUNT(DISTINCT ?axiom) as ?totalLogicalAxioms)
+WHERE {
+  VALUES ?property {
+    owl:equivalentClass
+    owl:disjointWith
+    owl:complementOf
+    owl:intersectionOf
+    owl:unionOf
+  }
+
+  ?axiom ?property ?target .
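+  # ?axiom is any subject that asserts one of the axiom properties listed above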
+
+  # Filter out built-in terms
+  FILTER(!STRSTARTS(STR(?axiom), "http://www.w3.org/"))
+}
\ No newline at end of file
diff --git a/docs/queries/metrics/RM001_instances_template.rq b/docs/queries/metrics/RM001_instances_template.rq
new file mode 100644
index 0000000000000000000000000000000000000000..89c4b84b8789d37f0d5cf936dee1674cf9dd6584
--- /dev/null
+++ b/docs/queries/metrics/RM001_instances_template.rq
@@ -0,0 +1,11 @@
+# Count all instances of the given resource type
+#
+# Explanation:
+# ?resource a {resource_type} finds all resources that are of type {resource_type}
+# COUNT(DISTINCT ?resource) counts unique resource instances
+# We use DISTINCT to avoid counting duplicates if a resource has multiple types
+
+SELECT (COUNT(DISTINCT ?resource) AS ?resourceCount)
+WHERE {
+  ?resource a {resource_type} .
+}
diff --git a/docs/queries/metrics/RM002_assertions_template.rq b/docs/queries/metrics/RM002_assertions_template.rq
new file mode 100644
index 0000000000000000000000000000000000000000..33c105c41bb2c846d2813573e2c4bc65c584ba65
--- /dev/null
+++ b/docs/queries/metrics/RM002_assertions_template.rq
@@ -0,0 +1,7 @@
+# Total number of assertions pointing to the resource type {resource_type}
+# (i.e. all triples with {resource_type} as object, mostly rdf:type assertions)
+
+SELECT (COUNT(*) AS ?numAssertions)
+WHERE {
+  ?subject ?predicate {resource_type} .
+}
diff --git a/docs/queries/metrics/RM003_linkage_template.rq b/docs/queries/metrics/RM003_linkage_template.rq
new file mode 100644
index 0000000000000000000000000000000000000000..08064de5b5dba8a45b1d4df8b42303c913515e9b
--- /dev/null
+++ b/docs/queries/metrics/RM003_linkage_template.rq
@@ -0,0 +1,30 @@
+# The average linkage degree for a specific resource type only
+# (i.e.: how many assertions per {resource_type} does the graph contain)
+
+SELECT (AVG(?totalDegree) as ?avgLinkageDegree)
+WHERE {
+  {
+    SELECT ?entity ((?outDegree + ?inDegree) as ?totalDegree)
+    WHERE {
+      # Only consider entities that are of the given resource type
+      ?entity a {resource_type} .
+
+      {
+        SELECT ?entity (COUNT(*) as ?outDegree)
+        WHERE {
+          ?entity a {resource_type} .
+          ?entity ?p ?o .
+        }
+        GROUP BY ?entity
+      }
+      {
+        SELECT ?entity (COUNT(*) as ?inDegree)
+        WHERE {
+          ?entity a {resource_type} .
+          ?s ?p ?entity .  # Here we count the connections pointing to the resource
+        }
+        GROUP BY ?entity
+      }
+    }
+  }
+}
diff --git a/docs/queries/metrics/RM004_1_out_edges_total_template.rq b/docs/queries/metrics/RM004_1_out_edges_total_template.rq
new file mode 100644
index 0000000000000000000000000000000000000000..5788a6db52e69708dfe3a9962f0d805710e47ec8
--- /dev/null
+++ b/docs/queries/metrics/RM004_1_out_edges_total_template.rq
@@ -0,0 +1,7 @@
+# Returns a single number: the count of unique {resource_type} nodes with at least one outgoing edge.
+
+SELECT (COUNT(DISTINCT ?source) AS ?uniqueNodes)
+WHERE {
+  ?source a {resource_type} .
+  ?source ?p ?outgoing .
+}
\ No newline at end of file
diff --git a/docs/queries/metrics/RM004_2_out_edges_min_template.rq b/docs/queries/metrics/RM004_2_out_edges_min_template.rq
new file mode 100644
index 0000000000000000000000000000000000000000..3fb1bff1676105fc413268ea869a0411ded9e4d4
--- /dev/null
+++ b/docs/queries/metrics/RM004_2_out_edges_min_template.rq
@@ -0,0 +1,11 @@
+# This SPARQL query calculates the minimum out-degree for {resource_type} only.
+
+SELECT (COUNT(?outgoing) AS ?outEdges)
+WHERE {
+  ?source a {resource_type} .
+  ?source ?p ?outgoing .
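+  # one row per outgoing triple of each {resource_type} instance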
+}
+GROUP BY ?source
+ORDER BY ASC(?outEdges)
+LIMIT 1
\ No newline at end of file
diff --git a/docs/queries/metrics/RM004_3_out_edges_median_template.rq b/docs/queries/metrics/RM004_3_out_edges_median_template.rq
new file mode 100644
index 0000000000000000000000000000000000000000..7f0c784f72cd1d6bbb8ea814f5d9c3653e090b45
--- /dev/null
+++ b/docs/queries/metrics/RM004_3_out_edges_median_template.rq
@@ -0,0 +1,15 @@
+# This SPARQL query calculates the median out-degree for {resource_type} only.
+# The placeholder {median_position} must be replaced with the actual position of the median.
+
+SELECT (?outEdges AS ?outMedian)
+WHERE {
+  SELECT ?source (COUNT(?outgoing) as ?outEdges)
+  WHERE {
+    ?source a {resource_type} .
+    ?source ?p ?outgoing .
+  }
+  GROUP BY ?source
+  ORDER BY ASC(?outEdges)
+}
+OFFSET {median_position}
+LIMIT 1
\ No newline at end of file
diff --git a/docs/queries/metrics/RM004_4_out_edges_max_template.rq b/docs/queries/metrics/RM004_4_out_edges_max_template.rq
new file mode 100644
index 0000000000000000000000000000000000000000..3bf47398fe8fc2d0fdd9aa0cc8a263e1bb2478fd
--- /dev/null
+++ b/docs/queries/metrics/RM004_4_out_edges_max_template.rq
@@ -0,0 +1,10 @@
+# This SPARQL query calculates the maximum out-degree for {resource_type} only.
+
+SELECT (COUNT(?outgoing) AS ?outEdges)
+WHERE {
+  ?source a {resource_type} .
+  ?source ?p ?outgoing .
+}
+GROUP BY ?source
+ORDER BY DESC(?outEdges)
+LIMIT 1
\ No newline at end of file
diff --git a/docs/queries/metrics/RM005_1_in_edges_total_template.rq b/docs/queries/metrics/RM005_1_in_edges_total_template.rq
new file mode 100644
index 0000000000000000000000000000000000000000..3a04008a1aaa944a60e3d67e52f923ecacaf020b
--- /dev/null
+++ b/docs/queries/metrics/RM005_1_in_edges_total_template.rq
@@ -0,0 +1,7 @@
+# Returns a single number: the count of unique {resource_type} nodes with at least one incoming edge.
+
+SELECT (COUNT(DISTINCT ?target) as ?uniqueNodes)
+WHERE {
+  ?incoming ?p ?target .
+  ?target a {resource_type} .
+}
\ No newline at end of file
diff --git a/docs/queries/metrics/RM005_2_in_edges_min_template.rq b/docs/queries/metrics/RM005_2_in_edges_min_template.rq
new file mode 100644
index 0000000000000000000000000000000000000000..afef730d90f980815ecebb93d3151aefe203d794
--- /dev/null
+++ b/docs/queries/metrics/RM005_2_in_edges_min_template.rq
@@ -0,0 +1,10 @@
+# This SPARQL query calculates the minimum in-degree for {resource_type} only.
+
+SELECT (COUNT(?incoming) as ?inEdges)
+WHERE {
+  ?target a {resource_type} .
+  ?incoming ?p ?target .
+}
+GROUP BY ?target
+ORDER BY ASC(?inEdges)
+LIMIT 1
diff --git a/docs/queries/metrics/RM005_3_in_edges_median_template.rq b/docs/queries/metrics/RM005_3_in_edges_median_template.rq
new file mode 100644
index 0000000000000000000000000000000000000000..3d0a28e08d83e3621efc128f44b78a1ee95f7ed4
--- /dev/null
+++ b/docs/queries/metrics/RM005_3_in_edges_median_template.rq
@@ -0,0 +1,16 @@
+# This SPARQL query calculates the median in-degree for {resource_type} only.
+# The placeholder {median_position} must be replaced with the actual position of the median.
+
+SELECT (?inEdges AS ?inMedian)
+WHERE {
+  SELECT ?target (COUNT(?incoming) as ?inEdges)
+  WHERE {
+    ?target a {resource_type} .
+    ?incoming ?p ?target .
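+    # count incoming edges per {resource_type} instance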
+  }
+  GROUP BY ?target
+  ORDER BY ASC(?inEdges)
+}
+OFFSET {median_position}
+LIMIT 1
diff --git a/docs/queries/metrics/RM005_4_in_edges_max_template.rq b/docs/queries/metrics/RM005_4_in_edges_max_template.rq
new file mode 100644
index 0000000000000000000000000000000000000000..0b10af8b068979d3f83974b8a3ecd53fba858dc0
--- /dev/null
+++ b/docs/queries/metrics/RM005_4_in_edges_max_template.rq
@@ -0,0 +1,10 @@
+# This SPARQL query calculates the maximum in-degree for {resource_type} only.
+
+SELECT (COUNT(?incoming) as ?inEdges)
+WHERE {
+  ?target a {resource_type} .
+  ?incoming ?p ?target .
+}
+GROUP BY ?target
+ORDER BY DESC(?inEdges)
+LIMIT 1
diff --git a/docs/queries/metrics/RM006_connectivity_template.rq b/docs/queries/metrics/RM006_connectivity_template.rq
new file mode 100644
index 0000000000000000000000000000000000000000..113022d24ba1349caf597fa5e858278faa04c838
--- /dev/null
+++ b/docs/queries/metrics/RM006_connectivity_template.rq
@@ -0,0 +1,16 @@
+# Analysis of total connected resources for {resource_type} only.
+#
+# Explanation:
+# Counts the number of resources that are connected to {resource_type}
+# This gives us a metric for {resource_type} connectivity in the graph
+
+SELECT (COUNT(DISTINCT ?connected) as ?numConnectedResources)
+WHERE {
+  ?resource a {resource_type} ;
+            ?property ?connected .
+
+  # Ensure connected resource is not a literal
+  FILTER(isIRI(?connected))
+  # Exclude self-references to entities
+  FILTER(?connected != ?resource)
+}
diff --git a/queries/AG001.rq b/docs/queries/questions/AG001.rq
similarity index 100%
rename from queries/AG001.rq
rename to docs/queries/questions/AG001.rq
diff --git a/queries/AG001_2.rq b/docs/queries/questions/AG001_2.rq
similarity index 100%
rename from queries/AG001_2.rq
rename to docs/queries/questions/AG001_2.rq
diff --git a/queries/AG002_1.rq b/docs/queries/questions/AG002_1.rq
similarity index 100%
rename from queries/AG002_1.rq
rename to docs/queries/questions/AG002_1.rq
diff --git a/queries/AG002_2.rq b/docs/queries/questions/AG002_2.rq
similarity index 100%
rename from queries/AG002_2.rq
rename to docs/queries/questions/AG002_2.rq
diff --git a/queries/AT001.rq b/docs/queries/questions/AT001.rq
similarity index 100%
rename from queries/AT001.rq
rename to docs/queries/questions/AT001.rq
diff --git a/queries/AT002_1.rq b/docs/queries/questions/AT002_1.rq
similarity index 100%
rename from queries/AT002_1.rq
rename to docs/queries/questions/AT002_1.rq
diff --git a/queries/AT002_2.rq b/docs/queries/questions/AT002_2.rq
similarity index 100%
rename from queries/AT002_2.rq
rename to docs/queries/questions/AT002_2.rq
diff --git a/queries/DA001.rq b/docs/queries/questions/DA001.rq
similarity index 100%
rename from queries/DA001.rq
rename to docs/queries/questions/DA001.rq
diff --git a/queries/DA002_1.rq b/docs/queries/questions/DA002_1.rq
similarity index 100%
rename from queries/DA002_1.rq
rename to docs/queries/questions/DA002_1.rq
diff --git a/queries/DA002_2.rq b/docs/queries/questions/DA002_2.rq
similarity index 100%
rename from queries/DA002_2.rq
rename to docs/queries/questions/DA002_2.rq
diff --git a/queries/DA003_1.rq b/docs/queries/questions/DA003_1.rq
similarity index 100%
rename from queries/DA003_1.rq
rename to docs/queries/questions/DA003_1.rq
diff --git a/queries/LH001.rq b/docs/queries/questions/LH001.rq
similarity index 100%
rename from queries/LH001.rq
rename to docs/queries/questions/LH001.rq
diff --git a/queries/LH002_1.rq b/docs/queries/questions/LH002_1.rq
similarity index 100%
rename from queries/LH002_1.rq
rename to docs/queries/questions/LH002_1.rq diff --git a/queries/LH002_2.rq b/docs/queries/questions/LH002_2.rq similarity index 100% rename from queries/LH002_2.rq rename to docs/queries/questions/LH002_2.rq diff --git a/queries/LR001.rq b/docs/queries/questions/LR001.rq similarity index 100% rename from queries/LR001.rq rename to docs/queries/questions/LR001.rq diff --git a/queries/LR002_1.rq b/docs/queries/questions/LR002_1.rq similarity index 100% rename from queries/LR002_1.rq rename to docs/queries/questions/LR002_1.rq diff --git a/queries/LR002_2.rq b/docs/queries/questions/LR002_2.rq similarity index 100% rename from queries/LR002_2.rq rename to docs/queries/questions/LR002_2.rq diff --git a/queries/MS001.rq b/docs/queries/questions/MS001.rq similarity index 100% rename from queries/MS001.rq rename to docs/queries/questions/MS001.rq diff --git a/queries/MS002_1.rq b/docs/queries/questions/MS002_1.rq similarity index 100% rename from queries/MS002_1.rq rename to docs/queries/questions/MS002_1.rq diff --git a/queries/MS002_2.rq b/docs/queries/questions/MS002_2.rq similarity index 100% rename from queries/MS002_2.rq rename to docs/queries/questions/MS002_2.rq diff --git a/queries/OG001.rq b/docs/queries/questions/OG001.rq similarity index 100% rename from queries/OG001.rq rename to docs/queries/questions/OG001.rq diff --git a/queries/OG002_1.rq b/docs/queries/questions/OG002_1.rq similarity index 100% rename from queries/OG002_1.rq rename to docs/queries/questions/OG002_1.rq diff --git a/queries/OG002_2.rq b/docs/queries/questions/OG002_2.rq similarity index 100% rename from queries/OG002_2.rq rename to docs/queries/questions/OG002_2.rq diff --git a/queries/PE001.rq b/docs/queries/questions/PE001.rq similarity index 100% rename from queries/PE001.rq rename to docs/queries/questions/PE001.rq diff --git a/queries/PE002_1.rq b/docs/queries/questions/PE002_1.rq similarity index 100% rename from queries/PE002_1.rq rename to docs/queries/questions/PE002_1.rq diff --git a/queries/PE002_2.rq b/docs/queries/questions/PE002_2.rq similarity index 100% rename from queries/PE002_2.rq rename to docs/queries/questions/PE002_2.rq diff --git a/queries/REG001.rq b/docs/queries/questions/REG001.rq similarity index 100% rename from queries/REG001.rq rename to docs/queries/questions/REG001.rq diff --git a/queries/REG002_1.rq b/docs/queries/questions/REG002_1.rq similarity index 100% rename from queries/REG002_1.rq rename to docs/queries/questions/REG002_1.rq diff --git a/queries/REG002_2.rq b/docs/queries/questions/REG002_2.rq similarity index 100% rename from queries/REG002_2.rq rename to docs/queries/questions/REG002_2.rq diff --git a/queries/REP001.rq b/docs/queries/questions/REP001.rq similarity index 100% rename from queries/REP001.rq rename to docs/queries/questions/REP001.rq diff --git a/queries/REP002_1.rq b/docs/queries/questions/REP002_1.rq similarity index 100% rename from queries/REP002_1.rq rename to docs/queries/questions/REP002_1.rq diff --git a/queries/REP002_2.rq b/docs/queries/questions/REP002_2.rq similarity index 100% rename from queries/REP002_2.rq rename to docs/queries/questions/REP002_2.rq diff --git a/queries/RP001.rq b/docs/queries/questions/RP001.rq similarity index 100% rename from queries/RP001.rq rename to docs/queries/questions/RP001.rq diff --git a/queries/RP002_1.rq b/docs/queries/questions/RP002_1.rq similarity index 100% rename from queries/RP002_1.rq rename to docs/queries/questions/RP002_1.rq diff --git a/queries/RP002_2.rq b/docs/queries/questions/RP002_2.rq 
similarity index 100%
rename from queries/RP002_2.rq
rename to docs/queries/questions/RP002_2.rq
diff --git a/queries/SC001.rq b/docs/queries/questions/SC001.rq
similarity index 100%
rename from queries/SC001.rq
rename to docs/queries/questions/SC001.rq
diff --git a/queries/SC002_1.rq b/docs/queries/questions/SC002_1.rq
similarity index 100%
rename from queries/SC002_1.rq
rename to docs/queries/questions/SC002_1.rq
diff --git a/queries/SC002_2.rq b/docs/queries/questions/SC002_2.rq
similarity index 100%
rename from queries/SC002_2.rq
rename to docs/queries/questions/SC002_2.rq
diff --git a/queries/TY001.rq b/docs/queries/questions/TY001.rq
similarity index 100%
rename from queries/TY001.rq
rename to docs/queries/questions/TY001.rq
diff --git a/queries/old/DR001_1.rq b/docs/queries/questions/old/DR001_1.rq
similarity index 100%
rename from queries/old/DR001_1.rq
rename to docs/queries/questions/old/DR001_1.rq
diff --git a/queries/old/OR001_1.rq b/docs/queries/questions/old/OR001_1.rq
similarity index 100%
rename from queries/old/OR001_1.rq
rename to docs/queries/questions/old/OR001_1.rq
diff --git a/queries/old/OR001_2.rq b/docs/queries/questions/old/OR001_2.rq
similarity index 100%
rename from queries/old/OR001_2.rq
rename to docs/queries/questions/old/OR001_2.rq
diff --git a/queries/old/OR002_1.rq b/docs/queries/questions/old/OR002_1.rq
similarity index 100%
rename from queries/old/OR002_1.rq
rename to docs/queries/questions/old/OR002_1.rq
diff --git a/queries/old/OR003_1.rq b/docs/queries/questions/old/OR003_1.rq
similarity index 100%
rename from queries/old/OR003_1.rq
rename to docs/queries/questions/old/OR003_1.rq
diff --git a/queries/old/OR004_1.rq b/docs/queries/questions/old/OR004_1.rq
similarity index 100%
rename from queries/old/OR004_1.rq
rename to docs/queries/questions/old/OR004_1.rq
diff --git a/queries/old/OR005_1.rq b/docs/queries/questions/old/OR005_1.rq
similarity index 100%
rename from queries/old/OR005_1.rq
rename to docs/queries/questions/old/OR005_1.rq
diff --git a/queries/old/OR006_1.rq b/docs/queries/questions/old/OR006_1.rq
similarity index 100%
rename from queries/old/OR006_1.rq
rename to docs/queries/questions/old/OR006_1.rq
diff --git a/docs/questions.md b/docs/questions.md
new file mode 100644
index 0000000000000000000000000000000000000000..c71267666debac1caf58785634350b27ba43e0ff
--- /dev/null
+++ b/docs/questions.md
@@ -0,0 +1,170 @@
+# Questions
+
+This is a collection of relevant questions and the corresponding SPARQL queries that answer them.
+
+The questions are grouped according to the different entities of interest
+(datasets, organizations, ...).
+
+The entities appear in alphabetical order. The first query is useful to get an overview of all entities available in the Knowledge Hub.
+
+Together, the questions listed below form the **domain coverage** of the Knowledge Hub. For details, see the [NFDI4Earth Deliverable D4.3.2](https://zenodo.org/records/7950860).
+
+## Overview of the types of entities
+
+| ID | Question | Query/ies |
+|---|---|---|
+| TY001 | What are the types of entities available in the knowledge graph? | [TY001](queries/questions/TY001.rq)|
+
+
+
+### Aggregator
+
+| ID | Question | Query/ies |
+|---|---|---|
+| AG001 | What are all entities of type Aggregator? | [AG001](queries/questions/AG001.rq)|
+| AG001_2 | What are the name and geometry of an Aggregator? | [AG001_2](queries/questions/AG001_2.rq)|
+| AG002_1 | What are all attributes available for the type "Aggregator"? 
| [AG002_1](queries/questions/AG002_1.rq)| +| AG002_2 | How many attributes are available for the type "Aggregator"? | [AG002_2](queries/questions/AG002_2.rq)| + +### Article + +| ID | Question | Query/ies | +|---|---|---| +| AT001 | What are all entities of type schema:Article? | [AT001](queries/questions/AT001.rq)| +| AT002_1 | What are all attributes available for the type "schema:Article"? | [AT002_1](queries/questions/AT002_1.rq)| +| AT002_2 | How many attributes are available for the type "schema:Article"? | [AT002_2](queries/questions/AT002_2.rq)| + +### Dataset + +| ID | Question | Query/ies | +|---|---|---| +| DA001 | What are all entities of type dcat:Dataset? | [DA001](queries/questions/DA001.rq)| +| DA002_1 | What are all attributes available for the type "dcat:Dataset"? | [DA002_1](queries/questions/DA002_1.rq)| +| DA002_2 | How many attributes are available for the type "dcat:Dataset"? | [DA002_2](queries/questions/DA002_2.rq)| +| DA003_1 | What are the datasets having the string 'world settlement footprint' in title or description? | [DA003_1](queries/questions/DA003_1.rq)| + +### LHBArticle + +| ID | Question | Query/ies | +|---|---|---| +| LH001 | What are all entities of type LHBArticle? | [LH001](queries/questions/LH001.rq)| +| LH002_1 | What are all attributes available for the type "LHBArticle"? | [LH002_1](queries/questions/LH002_1.rq)| +| LH002_2 | How many attributes are available for the type "LHBArticle"? | [LH002_2](queries/questions/LH002_2.rq)| + +### LearningResource + +| ID | Question | Query/ies | +|---|---|---| +| LR001 | What are all entities of type LearningResource? | [LR001](queries/questions/LR001.rq)| +| LR002_1 | What are all attributes available for the type "LearningResource"? | [LR002_1](queries/questions/LR002_1.rq)| +| LR002_2 | How many attributes are available for the type "LearningResource"? | [LR002_2](queries/questions/LR002_2.rq)| + +### MetadataStandard + +| ID | Question | Query/ies | +|---|---|---| +| MS001 | What are all entities of type MetadataStandard? | [MS001](queries/questions/MS001.rq)| +| MS002_1 | What are all attributes available for the type "MetadataStandard"? | [MS002_1](queries/questions/MS002_1.rq)| +| MS002_2 | How many attributes are available for the type "MetadataStandard"? | [MS002_2](queries/questions/MS002_2.rq)| + +### Organization + +| ID | Question | Query/ies | +|---|---|---| +| OG001 | What are all entities of type Organization? | [OG001](queries/questions/OG001.rq)| +| OG002_1 | What are all attributes available for the type "Organization"? | [OG002_1](queries/questions/OG002_1.rq)| +| OG002_2 | How many attributes are available for the type "Organization"? | [OG002_2](queries/questions/OG002_2.rq)| + +### Person + +| ID | Question | Query/ies | +|---|---|---| +| PE001 | What are all entities of type Person? | [PE001](queries/questions/PE001.rq)| +| PE002_1 | What are all attributes available for the type "Person"? | [PE002_1](queries/questions/PE002_1.rq)| +| PE002_2 | How many attributes are available for the type "Person"? | [PE002_2](queries/questions/PE002_2.rq)| + +### Registry + +| ID | Question | Query/ies | +|---|---|---| +| REG001 | What are all entities of type Registry? | [REG001](queries/questions/REG001.rq)| +| REG002_1 | What are all attributes available for the type "Registry"? | [REG002_1](queries/questions/REG002_1.rq)| +| REG002_2 | How many attributes are available for the type "Registry"? 
| [REG002_2](queries/questions/REG002_2.rq)| + +### Repository + +| ID | Question | Query/ies | +|---|---|---| +| REP001 | What are all entities of type Repository? | [REP001](queries/questions/REP001.rq)| +| REP002_1 | What are all attributes available for the type "Repository"? | [REP002_1](queries/questions/REP002_1.rq)| +| REP002_2 | How many attributes are available for the type "Repository"? | [REP002_2](queries/questions/REP002_2.rq)| + +### ResearchProject + +| ID | Question | Query/ies | +|---|---|---| +| RP001 | What are all entities of type ResearchProject? | [RP001](queries/questions/RP001.rq)| +| RP002_1 | What are all attributes available for the type "ResearchProject"? | [RP002_1](queries/questions/RP002_1.rq)| +| RP002_2 | How many attributes are available for the type "ResearchProject"? | [RP002_2](queries/questions/RP002_2.rq)| + + +### SoftwareSourceCode + +| ID | Question | Query/ies | +|---|---|---| +| SC001 | What are all entities of type SoftwareSourceCode? | [SC001](queries/questions/SC001.rq)| +| SC002_1 | What are all attributes available for the type "SoftwareSourceCode"? | [SC002_1](queries/questions/SC002_1.rq)| +| SC002_2 | How many attributes are available for the type "SoftwareSourceCode"? | [SC002_2](queries/questions/SC002_2.rq)| + + +<!--- Template for a new table (including first line) + +### EntityType + +| ID | Question | Query/ies | +|---|---|---| +| XX001 | What are all entities of type EntityType? | [XX001](queries/questions/XX001.rq)| + +--> + + +<!--- + + +### Organizations + +| ID | Question | Query/ies | +|---|---|---| +| OR001 | What is the URL of the homepage for the organization with the following name: 'Karlsruhe Institute of Technology'? | [OR001_1](queries/questions/OR001_1.rq),[OR001_2](queries/questions/OR001_2.rq) | +| OR002 | What is the URL of the homepage for the organization with the following ID: 'https://nfdi4earth-knowledgehub.geo.tu-dresden.de/api/objects/n4ekh/a38143be5e15bed94a20' | [OR003_1](queries/questions/OR003_1.rq) | +| OR003 | Which organizations have not defined any homepage? | [OR003_1](queries/questions/OR003_1.rq) | +| OR004 | Which services are published by the organization? | [OR004_1](queries/questions/OR004_1.rq) | +| OR005 | What is the geolocation of the organization called 'TU Dresden'? | [OR005_1](queries/questions/OR005_1.rq) | +| OR006 | What is the geolocation of all organizations, that are members of the NFDI4Earth consortium? 
| [OR006_1](queries/questions/OR006_1.rq) |
+
+### Repositories
+
+| ID | Question | Query/ies |
+|----|----------|-----------|
+| DR1 | At which repository can I archive my [geophysical] data of [2] GB?| [OR004_1](queries/questions/OR004_1.rq) |
+| DR2 | What is the temporal coverage of a data repository?||
+| DR3 | What is the spatial coverage of a data repository?||
+| DR4 | What is the curation policy of the data repository?||
+| DR5 | Which licences are supported by the data repository?||
+| DR6 | Does the repository give identifiers for its resources?||
+| DR7 | Which metadata harvesting interface is supported by the repository?||
+| DR8 | Which type of (persistent) identifiers are used by the repository?||
+| DR9 | What is the thematic area/subject of a repository?||
+| DR10 | Limitations of data deposit at the repository?||
+| DR11 | When was the metadata for a given repository first collected/last updated?||
+| DR12 | Is the repository still available?||
+| DR13 | Which repository allows long-term archiving?||
+
+-->
+
+## Notes
+
+This question-based approach takes inspiration from the [GeoSPARQLBenchmark](https://github.com/OpenLinkSoftware/GeoSPARQLBenchmark).
+
+It is directly linked to the [Knowledge Hub landing page project](https://git.rwth-aachen.de/nfdi4earth/knowledgehub/kh_landingpage), where the questions are used to explain the basic idea and demonstrate usage of the [Knowledge Hub](https://knowledgehub.nfdi4earth.de).
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 0000000000000000000000000000000000000000..329d821f82cc3899f88b175af780d9802253596b
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,28 @@
+site_name: NFDI4Earth - KnowledgeGraph - Questions & Metrics
+
+repo_url: https://git.rwth-aachen.de/nfdi4earth/knowledgehub/kh_questions
+
+theme:
+  name: material
+  language: en
+  logo: assets/NFDI4Earth_Symbol.png
+  favicon: assets/favicon.ico
+  features:
+    - search.highlight
+    - content.code.copy
+  palette:
+    - primary: blue
+      accent: cyan
+
+plugins:
+  - search
+  - awesome-nav
+  - macros:
+      module_name: docs/macros/main
+
+exclude_docs:
+  macros
+
+markdown_extensions:
+  - admonition
+  - pymdownx.details
diff --git a/reports/metrics/complexity.json b/reports/metrics/complexity.json
new file mode 100644
index 0000000000000000000000000000000000000000..aee2df80d284e6b9e55818cc2d651c91526e4f7c
--- /dev/null
+++ b/reports/metrics/complexity.json
@@ -0,0 +1,91 @@
+{
+    "structural_complexity": {
+        "name": "Structural Complexity",
+        "files": {
+            "classes": {
+                "name": "Number of Classes",
+                "file": "RF_001.rq",
+                "execute": true,
+                "weight": 0.2,
+                "result": "410",
+                "execution_time": 0.05,
+                "weighted_value": 82.0
+            },
+            "properties": {
+                "name": "Number of properties",
+                "file": "RF_002_1.rq",
+                "execute": true,
+                "weight": 0.2,
+                "result": "110",
+                "execution_time": 0.02,
+                "weighted_value": 22.0
+            },
+            "depth": {
+                "name": "Average class hierarchy depth",
+                "file": "RF_003.rq",
+                "execute": true,
+                "weight": 1,
+                "result": "3.34965034965035",
+                "execution_time": 0.03,
+                "weighted_value": 3.34965034965035
+            },
+            "width": {
+                "name": "Average class hierarchy width",
+                "file": "RF_004.rq",
+                "execute": true,
+                "weight": 1,
+                "result": "4.75",
+                "execution_time": 0.03,
+                "weighted_value": 4.75
+            }
+        }
+    },
+    "semantic_complexity": {
+        "name": "Semantic Complexity",
+        "files": {
+            "restrictions": {
+                "name": "Number of restrictions",
+                "file": "RF_005.rq",
+                "execute": true,
+                "weight": 0.2,
+                "result": "326",
+                "execution_time": 0.02,
+                "weighted_value": 65.2
+            
}, + "axioms": { + "name": "Number of logical axioms", + "file": "RF_006.rq", + "execute": true, + "weight": 1, + "result": "44", + "execution_time": 0.02, + "weighted_value": 44.0 + } + } + }, + "schematic_complexity": { + "name": "Schematic Complexity", + "files": { + "overall_complexity": { + "execute": false, + "name": "Overall Complexity", + "result": 221.29965034965034 + }, + "structural_complexity": { + "name": "Structural Complexity", + "execute": false, + "result": 112.09965034965035, + "weight": 1, + "weighted_value": 112.09965034965035 + }, + "semantic_complexity": { + "name": "Semantic Complexity", + "execute": false, + "result": 109.2, + "weight": 1, + "weighted_value": 109.2 + } + } + }, + "timestamp": "2025-03-19T14:41" +} \ No newline at end of file diff --git a/reports/metrics/general.json b/reports/metrics/general.json new file mode 100644 index 0000000000000000000000000000000000000000..1128352a3a4454cf6a0fb90f458aa5aef864c9a2 --- /dev/null +++ b/reports/metrics/general.json @@ -0,0 +1,96 @@ +{ + "instances": { + "name": "Instance Count", + "file": "GM001.rq", + "execute": true, + "result": "1252089", + "execution_time": 0.43 + }, + "assertions": { + "name": "Assertions Count", + "file": "GM002_1.rq", + "execute": true, + "result": "40786353", + "execution_time": 0.24 + }, + "linkage": { + "name": "Average Linkage Degree", + "file": "GM003_2.rq", + "execute": true, + "result": "3.24508957824795", + "execution_time": 23.04 + }, + "edges_out": { + "name": "Edges - outgoing", + "files": { + "total": { + "name": "Total number of outgoing edges", + "file": "GM004_2.rq", + "execute": true, + "result": "6705836", + "execution_time": 13.4 + }, + "min": { + "name": "Minimum of outgoing edges", + "file": "GM004_5.rq", + "execute": true, + "result": "1", + "execution_time": 23.76 + }, + "median": { + "name": "Median of outgoing edges", + "file": "GM004_4.rq", + "replace_dict": { + "{median_position}": "GM004_2.rq" + }, + "execute": true, + "result": "2", + "execution_time": 31.02 + }, + "max": { + "name": "Maximum of outgoing edges", + "file": "GM004_6.rq", + "execute": true, + "result": "282499", + "execution_time": 24.42 + } + } + }, + "edges_in": { + "name": "Edges - incoming", + "files": { + "total": { + "name": "Total number of incoming edges", + "file": "GM005_2.rq", + "execute": true, + "result": "15111836", + "execution_time": 25.74 + }, + "min": { + "name": "Minimum of incoming edges", + "file": "GM005_5.rq", + "execute": true, + "result": "1", + "execution_time": 21.76 + }, + "median": { + "name": "Median of incoming edges", + "file": "GM005_4.rq", + "replace_dict": { + "{median_position}": "GM005_2.rq" + }, + "execute": true, + "result": "1", + "execution_time": 36.21 + }, + "max": { + "name": "Maximum of incoming edges", + "file": "GM005_6.rq", + "execute": true, + "result": "1121975", + "execution_time": 22.61 + } + } + }, + "timestamp": "2025-03-18T10:45" +} \ No newline at end of file diff --git a/reports/metrics/resources.json b/reports/metrics/resources.json new file mode 100644 index 0000000000000000000000000000000000000000..3015d03388eb3a21dccdbc0ce8c50fc4d13944c8 --- /dev/null +++ b/reports/metrics/resources.json @@ -0,0 +1,1251 @@ +{ + "dataset": { + "instances": { + "name": "Number of Resources", + "file": "RM001_instances_template.rq", + "execute": true, + "result": "724903", + "execution_time": 0.42 + }, + "assertions": { + "name": "Number of Assertions", + "file": "RM002_assertions_template.rq", + "execute": true, + "result": "724904", + "execution_time": 
0.28 + }, + "linkage": { + "name": "Average Linkage", + "file": "RM003_linkage_template.rq", + "execute": true, + "result": "68", + "execution_time": 0.22 + }, + "edges_out": { + "name": "Edges - outgoing", + "files": { + "total": { + "name": "Total number of outgoing edges", + "file": "RM004_1_out_edges_total_template.rq", + "execute": true, + "result": "724903", + "execution_time": 4.67 + }, + "min": { + "name": "Minimum of outgoing edges", + "file": "RM004_2_out_edges_min_template.rq", + "execute": true, + "result": "6", + "execution_time": 0.83 + }, + "median": { + "name": "Median of outgoing edges", + "file": "RM004_3_out_edges_median_template.rq", + "replace_dict": { + "{median_position}": "RM004_1_out_edges_total_template.rq" + }, + "execute": true, + "result": "23", + "execution_time": 1.5 + }, + "max": { + "name": "Maximum of outgoing edges", + "file": "RM004_4_out_edges_max_template.rq", + "execute": true, + "result": "12519", + "execution_time": 1.26 + } + } + }, + "edges_in": { + "name": "Edges - incoming", + "files": { + "total": { + "name": "Total number of incoming edges", + "file": "RM005_1_in_edges_total_template.rq", + "execute": true, + "result": "4", + "execution_time": 1.32 + }, + "min": { + "name": "Minimum of incoming edges", + "file": "RM005_2_in_edges_min_template.rq", + "execute": true, + "result": "1", + "execution_time": 0.22 + }, + "median": { + "name": "Median of incoming edges", + "file": "RM005_3_in_edges_median_template.rq", + "replace_dict": { + "{median_position}": "RM005_1_in_edges_total_template.rq" + }, + "execute": true, + "result": "1", + "execution_time": 0.22 + }, + "max": { + "name": "Maximum of incoming edges", + "file": "RM005_4_in_edges_max_template.rq", + "execute": true, + "result": "107", + "execution_time": 0.22 + } + } + }, + "connectivity": { + "name": "Connectivity", + "file": "RM006_connectivity_template.rq", + "execute": true, + "result": "427594", + "execution_time": 4.48 + }, + "timestamp": "2025-03-18T10:45", + "file": "dataset.md" + }, + "publication": { + "instances": { + "name": "Number of Resources", + "file": "RM001_instances_template.rq", + "execute": true, + "result": "5", + "execution_time": 0.22 + }, + "assertions": { + "name": "Number of Assertions", + "file": "RM002_assertions_template.rq", + "execute": true, + "result": "6", + "execution_time": 0.22 + }, + "linkage": { + "name": "Average Linkage", + "file": "RM003_linkage_template.rq", + "execute": true, + "result": 0, + "execution_time": 0.19 + }, + "edges_out": { + "name": "Edges - outgoing", + "files": { + "total": { + "name": "Total number of outgoing edges", + "file": "RM004_1_out_edges_total_template.rq", + "execute": true, + "result": "5", + "execution_time": 0.19 + }, + "min": { + "name": "Minimum of outgoing edges", + "file": "RM004_2_out_edges_min_template.rq", + "execute": true, + "result": "7", + "execution_time": 0.22 + }, + "median": { + "name": "Median of outgoing edges", + "file": "RM004_3_out_edges_median_template.rq", + "replace_dict": { + "{median_position}": "RM004_1_out_edges_total_template.rq" + }, + "execute": true, + "result": "9", + "execution_time": 0.2 + }, + "max": { + "name": "Maximum of outgoing edges", + "file": "RM004_4_out_edges_max_template.rq", + "execute": true, + "result": "11", + "execution_time": 0.22 + } + } + }, + "edges_in": { + "name": "Edges - incoming", + "files": { + "total": { + "name": "Total number of incoming edges", + "file": "RM005_1_in_edges_total_template.rq", + "execute": true, + "result": "0", + "execution_time": 
0.22 + }, + "min": { + "name": "Minimum of incoming edges", + "file": "RM005_2_in_edges_min_template.rq", + "execute": true, + "result": null, + "execution_time": 0.22 + }, + "median": { + "name": "Median of incoming edges", + "file": "RM005_3_in_edges_median_template.rq", + "replace_dict": { + "{median_position}": "RM005_1_in_edges_total_template.rq" + }, + "execute": true, + "result": null, + "execution_time": 0.2 + }, + "max": { + "name": "Maximum of incoming edges", + "file": "RM005_4_in_edges_max_template.rq", + "execute": true, + "result": null, + "execution_time": 0.22 + } + } + }, + "connectivity": { + "name": "Connectivity", + "file": "RM006_connectivity_template.rq", + "execute": true, + "result": "15", + "execution_time": 0.22 + }, + "timestamp": "2025-03-18T10:45", + "file": "publication.md" + }, + "learning_resource": { + "instances": { + "name": "Number of Resources", + "file": "RM001_instances_template.rq", + "execute": true, + "result": "512", + "execution_time": 0.21 + }, + "assertions": { + "name": "Number of Assertions", + "file": "RM002_assertions_template.rq", + "execute": true, + "result": "513", + "execution_time": 0.19 + }, + "linkage": { + "name": "Average Linkage", + "file": "RM003_linkage_template.rq", + "execute": true, + "result": "25", + "execution_time": 0.2 + }, + "edges_out": { + "name": "Edges - outgoing", + "files": { + "total": { + "name": "Total number of outgoing edges", + "file": "RM004_1_out_edges_total_template.rq", + "execute": true, + "result": "512", + "execution_time": 0.19 + }, + "min": { + "name": "Minimum of outgoing edges", + "file": "RM004_2_out_edges_min_template.rq", + "execute": true, + "result": "10", + "execution_time": 0.22 + }, + "median": { + "name": "Median of outgoing edges", + "file": "RM004_3_out_edges_median_template.rq", + "replace_dict": { + "{median_position}": "RM004_1_out_edges_total_template.rq" + }, + "execute": true, + "result": "15", + "execution_time": 0.21 + }, + "max": { + "name": "Maximum of outgoing edges", + "file": "RM004_4_out_edges_max_template.rq", + "execute": true, + "result": "27", + "execution_time": 0.2 + } + } + }, + "edges_in": { + "name": "Edges - incoming", + "files": { + "total": { + "name": "Total number of incoming edges", + "file": "RM005_1_in_edges_total_template.rq", + "execute": true, + "result": "1", + "execution_time": 0.21 + }, + "min": { + "name": "Minimum of incoming edges", + "file": "RM005_2_in_edges_min_template.rq", + "execute": true, + "result": "3", + "execution_time": 0.2 + }, + "median": { + "name": "Median of incoming edges", + "file": "RM005_3_in_edges_median_template.rq", + "replace_dict": { + "{median_position}": "RM005_1_in_edges_total_template.rq" + }, + "execute": true, + "result": "3", + "execution_time": 0.1 + }, + "max": { + "name": "Maximum of incoming edges", + "file": "RM005_4_in_edges_max_template.rq", + "execute": true, + "result": "3", + "execution_time": 0.19 + } + } + }, + "connectivity": { + "name": "Connectivity", + "file": "RM006_connectivity_template.rq", + "execute": true, + "result": "530", + "execution_time": 0.19 + }, + "timestamp": "2025-03-18T10:46", + "file": "learning_resource.md" + }, + "repository": { + "instances": { + "name": "Number of Resources", + "file": "RM001_instances_template.rq", + "execute": true, + "result": "159", + "execution_time": 0.19 + }, + "assertions": { + "name": "Number of Assertions", + "file": "RM002_assertions_template.rq", + "execute": true, + "result": "162", + "execution_time": 0.19 + }, + "linkage": { + "name": "Average 
Linkage", + "file": "RM003_linkage_template.rq", + "execute": true, + "result": "90212", + "execution_time": 0.22 + }, + "edges_out": { + "name": "Edges - outgoing", + "files": { + "total": { + "name": "Total number of outgoing edges", + "file": "RM004_1_out_edges_total_template.rq", + "execute": true, + "result": "159", + "execution_time": 0.1 + }, + "min": { + "name": "Minimum of outgoing edges", + "file": "RM004_2_out_edges_min_template.rq", + "execute": true, + "result": "7", + "execution_time": 0.17 + }, + "median": { + "name": "Median of outgoing edges", + "file": "RM004_3_out_edges_median_template.rq", + "replace_dict": { + "{median_position}": "RM004_1_out_edges_total_template.rq" + }, + "execute": true, + "result": "44", + "execution_time": 0.21 + }, + "max": { + "name": "Maximum of outgoing edges", + "file": "RM004_4_out_edges_max_template.rq", + "execute": true, + "result": "92", + "execution_time": 0.22 + } + } + }, + "edges_in": { + "name": "Edges - incoming", + "files": { + "total": { + "name": "Total number of incoming edges", + "file": "RM005_1_in_edges_total_template.rq", + "execute": true, + "result": "5", + "execution_time": 0.22 + }, + "min": { + "name": "Minimum of incoming edges", + "file": "RM005_2_in_edges_min_template.rq", + "execute": true, + "result": "1417", + "execution_time": 0.22 + }, + "median": { + "name": "Median of incoming edges", + "file": "RM005_3_in_edges_median_template.rq", + "replace_dict": { + "{median_position}": "RM005_1_in_edges_total_template.rq" + }, + "execute": true, + "result": "6718", + "execution_time": 0.22 + }, + "max": { + "name": "Maximum of incoming edges", + "file": "RM005_4_in_edges_max_template.rq", + "execute": true, + "result": "432941", + "execution_time": 0.22 + } + } + }, + "connectivity": { + "name": "Connectivity", + "file": "RM006_connectivity_template.rq", + "execute": true, + "result": "825", + "execution_time": 0.2 + }, + "timestamp": "2025-03-18T10:46", + "file": "repository.md" + }, + "article_lhb": { + "instances": { + "name": "Number of Resources", + "file": "RM001_instances_template.rq", + "execute": true, + "result": "114", + "execution_time": 0.22 + }, + "assertions": { + "name": "Number of Assertions", + "file": "RM002_assertions_template.rq", + "execute": true, + "result": "117", + "execution_time": 0.21 + }, + "linkage": { + "name": "Average Linkage", + "file": "RM003_linkage_template.rq", + "execute": true, + "result": "33.245098039215686", + "execution_time": 0.21 + }, + "edges_out": { + "name": "Edges - outgoing", + "files": { + "total": { + "name": "Total number of outgoing edges", + "file": "RM004_1_out_edges_total_template.rq", + "execute": true, + "result": "114", + "execution_time": 0.21 + }, + "min": { + "name": "Minimum of outgoing edges", + "file": "RM004_2_out_edges_min_template.rq", + "execute": true, + "result": "11", + "execution_time": 0.2 + }, + "median": { + "name": "Median of outgoing edges", + "file": "RM004_3_out_edges_median_template.rq", + "replace_dict": { + "{median_position}": "RM004_1_out_edges_total_template.rq" + }, + "execute": true, + "result": "30", + "execution_time": 0.19 + }, + "max": { + "name": "Maximum of outgoing edges", + "file": "RM004_4_out_edges_max_template.rq", + "execute": true, + "result": "62", + "execution_time": 0.22 + } + } + }, + "edges_in": { + "name": "Edges - incoming", + "files": { + "total": { + "name": "Total number of incoming edges", + "file": "RM005_1_in_edges_total_template.rq", + "execute": true, + "result": "102", + "execution_time": 0.14 + }, + 
"min": { + "name": "Minimum of incoming edges", + "file": "RM005_2_in_edges_min_template.rq", + "execute": true, + "result": "1", + "execution_time": 0.15 + }, + "median": { + "name": "Median of incoming edges", + "file": "RM005_3_in_edges_median_template.rq", + "replace_dict": { + "{median_position}": "RM005_1_in_edges_total_template.rq" + }, + "execute": true, + "result": "2", + "execution_time": 0.2 + }, + "max": { + "name": "Maximum of incoming edges", + "file": "RM005_4_in_edges_max_template.rq", + "execute": true, + "result": "24", + "execution_time": 0.22 + } + } + }, + "connectivity": { + "name": "Connectivity", + "file": "RM006_connectivity_template.rq", + "execute": true, + "result": "592", + "execution_time": 0.22 + }, + "timestamp": "2025-03-18T10:46", + "file": "article_lhb.md" + }, + "standards": { + "instances": { + "name": "Number of Resources", + "file": "RM001_instances_template.rq", + "execute": true, + "result": "89", + "execution_time": 0.22 + }, + "assertions": { + "name": "Number of Assertions", + "file": "RM002_assertions_template.rq", + "execute": true, + "result": "101", + "execution_time": 0.13 + }, + "linkage": { + "name": "Average Linkage", + "file": "RM003_linkage_template.rq", + "execute": true, + "result": "13.714285714285714", + "execution_time": 0.15 + }, + "edges_out": { + "name": "Edges - outgoing", + "files": { + "total": { + "name": "Total number of outgoing edges", + "file": "RM004_1_out_edges_total_template.rq", + "execute": true, + "result": "89", + "execution_time": 0.21 + }, + "min": { + "name": "Minimum of outgoing edges", + "file": "RM004_2_out_edges_min_template.rq", + "execute": true, + "result": "5", + "execution_time": 0.19 + }, + "median": { + "name": "Median of outgoing edges", + "file": "RM004_3_out_edges_median_template.rq", + "replace_dict": { + "{median_position}": "RM004_1_out_edges_total_template.rq" + }, + "execute": true, + "result": "9", + "execution_time": 0.2 + }, + "max": { + "name": "Maximum of outgoing edges", + "file": "RM004_4_out_edges_max_template.rq", + "execute": true, + "result": "27", + "execution_time": 0.1 + } + } + }, + "edges_in": { + "name": "Edges - incoming", + "files": { + "total": { + "name": "Total number of incoming edges", + "file": "RM005_1_in_edges_total_template.rq", + "execute": true, + "result": "77", + "execution_time": 0.19 + }, + "min": { + "name": "Minimum of incoming edges", + "file": "RM005_2_in_edges_min_template.rq", + "execute": true, + "result": "1", + "execution_time": 0.2 + }, + "median": { + "name": "Median of incoming edges", + "file": "RM005_3_in_edges_median_template.rq", + "replace_dict": { + "{median_position}": "RM005_1_in_edges_total_template.rq" + }, + "execute": true, + "result": "1", + "execution_time": 0.19 + }, + "max": { + "name": "Maximum of incoming edges", + "file": "RM005_4_in_edges_max_template.rq", + "execute": true, + "result": "56", + "execution_time": 0.22 + } + } + }, + "connectivity": { + "name": "Connectivity", + "file": "RM006_connectivity_template.rq", + "execute": true, + "result": "100", + "execution_time": 1.2 + }, + "timestamp": "2025-03-18T10:46", + "file": "standards.md" + }, + "software": { + "instances": { + "name": "Number of Resources", + "file": "RM001_instances_template.rq", + "execute": true, + "result": "147", + "execution_time": 0.2 + }, + "assertions": { + "name": "Number of Assertions", + "file": "RM002_assertions_template.rq", + "execute": true, + "result": "148", + "execution_time": 0.22 + }, + "linkage": { + "name": "Average Linkage", + 
"file": "RM003_linkage_template.rq", + "execute": true, + "result": "11.5", + "execution_time": 0.23 + }, + "edges_out": { + "name": "Edges - outgoing", + "files": { + "total": { + "name": "Total number of outgoing edges", + "file": "RM004_1_out_edges_total_template.rq", + "execute": true, + "result": "147", + "execution_time": 0.21 + }, + "min": { + "name": "Minimum of outgoing edges", + "file": "RM004_2_out_edges_min_template.rq", + "execute": true, + "result": "7", + "execution_time": 0.17 + }, + "median": { + "name": "Median of outgoing edges", + "file": "RM004_3_out_edges_median_template.rq", + "replace_dict": { + "{median_position}": "RM004_1_out_edges_total_template.rq" + }, + "execute": true, + "result": "20", + "execution_time": 0.22 + }, + "max": { + "name": "Maximum of outgoing edges", + "file": "RM004_4_out_edges_max_template.rq", + "execute": true, + "result": "122", + "execution_time": 0.22 + } + } + }, + "edges_in": { + "name": "Edges - incoming", + "files": { + "total": { + "name": "Total number of incoming edges", + "file": "RM005_1_in_edges_total_template.rq", + "execute": true, + "result": "8", + "execution_time": 0.22 + }, + "min": { + "name": "Minimum of incoming edges", + "file": "RM005_2_in_edges_min_template.rq", + "execute": true, + "result": "1", + "execution_time": 0.1 + }, + "median": { + "name": "Median of incoming edges", + "file": "RM005_3_in_edges_median_template.rq", + "replace_dict": { + "{median_position}": "RM005_1_in_edges_total_template.rq" + }, + "execute": true, + "result": "1", + "execution_time": 0.17 + }, + "max": { + "name": "Maximum of incoming edges", + "file": "RM005_4_in_edges_max_template.rq", + "execute": true, + "result": "2", + "execution_time": 0.19 + } + } + }, + "connectivity": { + "name": "Connectivity", + "file": "RM006_connectivity_template.rq", + "execute": true, + "result": "1103", + "execution_time": 0.22 + }, + "timestamp": "2025-03-18T10:46", + "file": "software.md" + }, + "service": { + "instances": { + "name": "Number of Resources", + "file": "RM001_instances_template.rq", + "execute": true, + "result": "1", + "execution_time": 0.24 + }, + "assertions": { + "name": "Number of Assertions", + "file": "RM002_assertions_template.rq", + "execute": true, + "result": "1", + "execution_time": 0.19 + }, + "linkage": { + "name": "Average Linkage", + "file": "RM003_linkage_template.rq", + "execute": true, + "result": 0, + "execution_time": 0.22 + }, + "edges_out": { + "name": "Edges - outgoing", + "files": { + "total": { + "name": "Total number of outgoing edges", + "file": "RM004_1_out_edges_total_template.rq", + "execute": true, + "result": "1", + "execution_time": 0.1 + }, + "min": { + "name": "Minimum of outgoing edges", + "file": "RM004_2_out_edges_min_template.rq", + "execute": true, + "result": "14", + "execution_time": 0.2 + }, + "median": { + "name": "Median of outgoing edges", + "file": "RM004_3_out_edges_median_template.rq", + "replace_dict": { + "{median_position}": "RM004_1_out_edges_total_template.rq" + }, + "execute": true, + "result": "14", + "execution_time": 0.19 + }, + "max": { + "name": "Maximum of outgoing edges", + "file": "RM004_4_out_edges_max_template.rq", + "execute": true, + "result": "14", + "execution_time": 0.16 + } + } + }, + "edges_in": { + "name": "Edges - incoming", + "files": { + "total": { + "name": "Total number of incoming edges", + "file": "RM005_1_in_edges_total_template.rq", + "execute": true, + "result": "0", + "execution_time": 0.12 + }, + "min": { + "name": "Minimum of incoming edges", + 
"file": "RM005_2_in_edges_min_template.rq", + "execute": true, + "result": null, + "execution_time": 0.22 + }, + "median": { + "name": "Median of incoming edges", + "file": "RM005_3_in_edges_median_template.rq", + "replace_dict": { + "{median_position}": "RM005_1_in_edges_total_template.rq" + }, + "execute": true, + "result": null, + "execution_time": 0.2 + }, + "max": { + "name": "Maximum of incoming edges", + "file": "RM005_4_in_edges_max_template.rq", + "execute": true, + "result": null, + "execution_time": 0.19 + } + } + }, + "connectivity": { + "name": "Connectivity", + "file": "RM006_connectivity_template.rq", + "execute": true, + "result": "13", + "execution_time": 0.19 + }, + "timestamp": "2025-03-18T10:46", + "file": "service.md" + }, + "data_service": { + "instances": { + "name": "Number of Resources", + "file": "RM001_instances_template.rq", + "execute": true, + "result": "363632", + "execution_time": 0.25 + }, + "assertions": { + "name": "Number of Assertions", + "file": "RM002_assertions_template.rq", + "execute": true, + "result": "363633", + "execution_time": 0.2 + }, + "linkage": { + "name": "Average Linkage", + "file": "RM003_linkage_template.rq", + "execute": true, + "result": 0, + "execution_time": 1.2 + }, + "edges_out": { + "name": "Edges - outgoing", + "files": { + "total": { + "name": "Total number of outgoing edges", + "file": "RM004_1_out_edges_total_template.rq", + "execute": true, + "result": "363632", + "execution_time": 1.94 + }, + "min": { + "name": "Minimum of outgoing edges", + "file": "RM004_2_out_edges_min_template.rq", + "execute": true, + "result": "11", + "execution_time": 0.6 + }, + "median": { + "name": "Median of outgoing edges", + "file": "RM004_3_out_edges_median_template.rq", + "replace_dict": { + "{median_position}": "RM004_1_out_edges_total_template.rq" + }, + "execute": true, + "result": "28", + "execution_time": 0.99 + }, + "max": { + "name": "Maximum of outgoing edges", + "file": "RM004_4_out_edges_max_template.rq", + "execute": true, + "result": "12519", + "execution_time": 0.64 + } + } + }, + "edges_in": { + "name": "Edges - incoming", + "files": { + "total": { + "name": "Total number of incoming edges", + "file": "RM005_1_in_edges_total_template.rq", + "execute": true, + "result": "0", + "execution_time": 0.24 + }, + "min": { + "name": "Minimum of incoming edges", + "file": "RM005_2_in_edges_min_template.rq", + "execute": true, + "result": null, + "execution_time": 0.22 + }, + "median": { + "name": "Median of incoming edges", + "file": "RM005_3_in_edges_median_template.rq", + "replace_dict": { + "{median_position}": "RM005_1_in_edges_total_template.rq" + }, + "execute": true, + "result": null, + "execution_time": 0.19 + }, + "max": { + "name": "Maximum of incoming edges", + "file": "RM005_4_in_edges_max_template.rq", + "execute": true, + "result": null, + "execution_time": 0.22 + } + } + }, + "connectivity": { + "name": "Connectivity", + "file": "RM006_connectivity_template.rq", + "execute": true, + "result": "58909", + "execution_time": 2.42 + }, + "timestamp": "2025-03-18T10:46", + "file": "data_service.md" + }, + "aggregator": { + "instances": { + "name": "Number of Resources", + "file": "RM001_instances_template.rq", + "execute": true, + "result": "38", + "execution_time": 0.1 + }, + "assertions": { + "name": "Number of Assertions", + "file": "RM002_assertions_template.rq", + "execute": true, + "result": "39", + "execution_time": 0.12 + }, + "linkage": { + "name": "Average Linkage", + "file": "RM003_linkage_template.rq", + "execute": 
true, + "result": "187", + "execution_time": 0.2 + }, + "edges_out": { + "name": "Edges - outgoing", + "files": { + "total": { + "name": "Total number of outgoing edges", + "file": "RM004_1_out_edges_total_template.rq", + "execute": true, + "result": "38", + "execution_time": 0.21 + }, + "min": { + "name": "Minimum of outgoing edges", + "file": "RM004_2_out_edges_min_template.rq", + "execute": true, + "result": "8", + "execution_time": 0.16 + }, + "median": { + "name": "Median of outgoing edges", + "file": "RM004_3_out_edges_median_template.rq", + "replace_dict": { + "{median_position}": "RM004_1_out_edges_total_template.rq" + }, + "execute": true, + "result": "36", + "execution_time": 0.22 + }, + "max": { + "name": "Maximum of outgoing edges", + "file": "RM004_4_out_edges_max_template.rq", + "execute": true, + "result": "57", + "execution_time": 0.22 + } + } + }, + "edges_in": { + "name": "Edges - incoming", + "files": { + "total": { + "name": "Total number of incoming edges", + "file": "RM005_1_in_edges_total_template.rq", + "execute": true, + "result": "1", + "execution_time": 0.22 + }, + "min": { + "name": "Minimum of incoming edges", + "file": "RM005_2_in_edges_min_template.rq", + "execute": true, + "result": "151", + "execution_time": 0.1 + }, + "median": { + "name": "Median of incoming edges", + "file": "RM005_3_in_edges_median_template.rq", + "replace_dict": { + "{median_position}": "RM005_1_in_edges_total_template.rq" + }, + "execute": true, + "result": "151", + "execution_time": 0.2 + }, + "max": { + "name": "Maximum of incoming edges", + "file": "RM005_4_in_edges_max_template.rq", + "execute": true, + "result": "151", + "execution_time": 0.21 + } + } + }, + "connectivity": { + "name": "Connectivity", + "file": "RM006_connectivity_template.rq", + "execute": true, + "result": "239", + "execution_time": 0.22 + }, + "timestamp": "2025-03-18T10:46", + "file": "aggregator.md" + }, + "person": { + "instances": { + "name": "Number of Resources", + "file": "RM001_instances_template.rq", + "execute": true, + "result": "2180", + "execution_time": 0.22 + }, + "assertions": { + "name": "Number of Assertions", + "file": "RM002_assertions_template.rq", + "execute": true, + "result": "2181", + "execution_time": 0.22 + }, + "linkage": { + "name": "Average Linkage", + "file": "RM003_linkage_template.rq", + "execute": true, + "result": "4.628269848554383", + "execution_time": 0.2 + }, + "edges_out": { + "name": "Edges - outgoing", + "files": { + "total": { + "name": "Total number of outgoing edges", + "file": "RM004_1_out_edges_total_template.rq", + "execute": true, + "result": "2180", + "execution_time": 0.21 + }, + "min": { + "name": "Minimum of outgoing edges", + "file": "RM004_2_out_edges_min_template.rq", + "execute": true, + "result": "2", + "execution_time": 0.14 + }, + "median": { + "name": "Median of outgoing edges", + "file": "RM004_3_out_edges_median_template.rq", + "replace_dict": { + "{median_position}": "RM004_1_out_edges_total_template.rq" + }, + "execute": true, + "result": "4", + "execution_time": 0.23 + }, + "max": { + "name": "Maximum of outgoing edges", + "file": "RM004_4_out_edges_max_template.rq", + "execute": true, + "result": "7", + "execution_time": 0.21 + } + } + }, + "edges_in": { + "name": "Edges - incoming", + "files": { + "total": { + "name": "Total number of incoming edges", + "file": "RM005_1_in_edges_total_template.rq", + "execute": true, + "result": "2179", + "execution_time": 0.19 + }, + "min": { + "name": "Minimum of incoming edges", + "file": 
"RM005_2_in_edges_min_template.rq", + "execute": true, + "result": "1", + "execution_time": 0.19 + }, + "median": { + "name": "Median of incoming edges", + "file": "RM005_3_in_edges_median_template.rq", + "replace_dict": { + "{median_position}": "RM005_1_in_edges_total_template.rq" + }, + "execute": true, + "result": "1", + "execution_time": 0.18 + }, + "max": { + "name": "Maximum of incoming edges", + "file": "RM005_4_in_edges_max_template.rq", + "execute": true, + "result": "2", + "execution_time": 0.19 + } + } + }, + "connectivity": { + "name": "Connectivity", + "file": "RM006_connectivity_template.rq", + "execute": true, + "result": "2", + "execution_time": 0.22 + }, + "timestamp": "2025-03-18T10:46", + "file": "person.md" + }, + "registry": { + "instances": { + "name": "Number of Resources", + "file": "RM001_instances_template.rq", + "execute": true, + "result": "5", + "execution_time": 0.21 + }, + "assertions": { + "name": "Number of Assertions", + "file": "RM002_assertions_template.rq", + "execute": true, + "result": "6", + "execution_time": 0.2 + }, + "linkage": { + "name": "Average Linkage", + "file": "RM003_linkage_template.rq", + "execute": true, + "result": 0, + "execution_time": 0.22 + }, + "edges_out": { + "name": "Edges - outgoing", + "files": { + "total": { + "name": "Total number of outgoing edges", + "file": "RM004_1_out_edges_total_template.rq", + "execute": true, + "result": "5", + "execution_time": 0.1 + }, + "min": { + "name": "Minimum of outgoing edges", + "file": "RM004_2_out_edges_min_template.rq", + "execute": true, + "result": "7", + "execution_time": 0.19 + }, + "median": { + "name": "Median of outgoing edges", + "file": "RM004_3_out_edges_median_template.rq", + "replace_dict": { + "{median_position}": "RM004_1_out_edges_total_template.rq" + }, + "execute": true, + "result": "9", + "execution_time": 0.19 + }, + "max": { + "name": "Maximum of outgoing edges", + "file": "RM004_4_out_edges_max_template.rq", + "execute": true, + "result": "11", + "execution_time": 0.16 + } + } + }, + "edges_in": { + "name": "Edges - incoming", + "files": { + "total": { + "name": "Total number of incoming edges", + "file": "RM005_1_in_edges_total_template.rq", + "execute": true, + "result": "0", + "execution_time": 0.13 + }, + "min": { + "name": "Minimum of incoming edges", + "file": "RM005_2_in_edges_min_template.rq", + "execute": true, + "result": null, + "execution_time": 0.2 + }, + "median": { + "name": "Median of incoming edges", + "file": "RM005_3_in_edges_median_template.rq", + "replace_dict": { + "{median_position}": "RM005_1_in_edges_total_template.rq" + }, + "execute": true, + "result": null, + "execution_time": 0.2 + }, + "max": { + "name": "Maximum of incoming edges", + "file": "RM005_4_in_edges_max_template.rq", + "execute": true, + "result": null, + "execution_time": 0.22 + } + } + }, + "connectivity": { + "name": "Connectivity", + "file": "RM006_connectivity_template.rq", + "execute": true, + "result": "15", + "execution_time": 0.22 + }, + "timestamp": "2025-03-18T10:46", + "file": "registry.md" + }, + "timestamp": "2025-03-18T10:46" +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7e9d53dc8c973199f405a4e9201d360eddae035 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +mkdocs +mkdocs-material +mkdocs-macros-plugin +mkdocs-awesome-nav diff --git a/scripts/.flake8 b/scripts/.flake8 new file mode 100644 index 
0000000000000000000000000000000000000000..bfead2ca040c32ab6b088cbe7534a804075720f9 --- /dev/null +++ b/scripts/.flake8 @@ -0,0 +1,2 @@ +[flake8] +max-line-length = 79 diff --git a/scripts/.gitignore b/scripts/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..154d56a5bd32255a8db74b795a7d5110c731a84a --- /dev/null +++ b/scripts/.gitignore @@ -0,0 +1,6 @@ +**/__pycache__ +**/dist + +*.egg-info +*.pyc +*.ini diff --git a/scripts/LICENSE b/scripts/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..d5777251382417d9b6a24495ea6cc84766d43259 --- /dev/null +++ b/scripts/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2025 NFDI4Earth / SoftwareAndArchitecture + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/scripts/MANIFEST.in b/scripts/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 0000000000000000000000000000000000000000..650e60a8c260dec9df2cec0f4987b38c260f4f85 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,60 @@ +# KG Analysis Tool + +A command-line tool for analyzing SPARQL endpoints, specifically designed for the NFDI4Earth Knowledge Graph and its metrics collection. + +## Features + +- Run SPARQL queries from files +- Execute predefined metric queries +- Save query results in JSON +- Configurable SPARQL endpoint, request timeout & directory of reports via environment variables +- Supports query parameters and templates + +## Installation + +```bash +# Create and activate virtual environment +python3 -m venv venv +source venv/bin/activate + +# Install in development mode +pip install -e . +``` + +## Usage + +### Environment Variables + +```bash +# Required +export SPARQL_ENDPOINT="https://sparql.knowledgehub.nfdi4earth.de" + +# Optional +export SPARQL_TIMEOUT=120 # in seconds +export REPORTS_DIR="./reports/metrics" +``` + +### CLI Commands + +Run a single query: + +```bash +kg-analysis query -q path/to/query.rq +``` + +Request/Calculate all metrics: + +```bash +kg-analysis metrics +``` + +## License + +This project is published under the Apache License 2.0, see file [`LICENSE`](LICENSE). 
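+## Programmatic Use
+
+The query runner can also be used directly from Python, for example to fill
+the `{resource_type}` placeholder of a metric template. A minimal sketch,
+assuming the package is installed and the template file resolves relative to
+the working directory (the file name and type URI are taken from this
+repository):
+
+```python
+from pathlib import Path
+
+from kg_analysis.query_runner import QueryRunner
+
+# Endpoint and timeout are read from SPARQL_ENDPOINT / SPARQL_TIMEOUT
+runner = QueryRunner()
+
+# Count all dcat:Dataset resources by filling the template placeholder
+result = runner.execute_query(
+    Path("metrics/RM001_instances_template.rq"),  # assumed location
+    replace_dict={"{resource_type}": "<http://www.w3.org/ns/dcat#Dataset>"},
+)
+print(result["results"]["bindings"][0])
+```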
+ +## Related Projects + +- [NFDI4Earth KnowledgeHub](https://knowledgehub.nfdi4earth.de) +- [OneStop4All](https://onestop4all.nfdi4earth.de) + +Contributors: Ralf Klammer diff --git a/scripts/kg_analysis/__init__.py b/scripts/kg_analysis/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..231e26c4d0fe3678d3ae1410c2e6c1a5c06b0853 --- /dev/null +++ b/scripts/kg_analysis/__init__.py @@ -0,0 +1,15 @@ +import os + +from pathlib import Path +from rich.console import Console # type: ignore +from rich import inspect # type: ignore +from rich import print as rprint # type: ignore + +console = Console() + +SPARQL_ENDPOINT = os.getenv( + "SPARQL_ENDPOINT", "https://sparql.knowledgehub.nfdi4earth.de" +) +SPARQL_TIMEOUT = int(os.getenv("SPARQL_TIMEOUT", 120)) + +REPORTS_DIR = Path(os.getenv("REPORTS_DIR", "./reports/metrics")) diff --git a/scripts/kg_analysis/cli.py b/scripts/kg_analysis/cli.py new file mode 100644 index 0000000000000000000000000000000000000000..f6a1de77359cc28a8a25205f39a0f58f64ea1fdc --- /dev/null +++ b/scripts/kg_analysis/cli.py @@ -0,0 +1,80 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2025 TU-Dresden, ZIH +# ralf.klammer@tu-dresden.de +import logging + +import click + +from pathlib import Path +from rich import print_json + +from . import REPORTS_DIR + +from .util import cli_startup +from .query_runner import QueryRunner # type: ignore +from .metrics_runner import MetricsRunner # type: ignore + + +log = logging.getLogger(__name__) + + +@click.group() +@click.option("--debug/--no-debug", "-d", is_flag=True, default=False) +@click.option("--output", "-o", type=click.Path(), help="Path to save results") +@click.pass_context +def main(ctx, debug, output): + cli_startup(log_level=debug and logging.DEBUG or logging.INFO) + ctx.ensure_object(dict) + ctx.obj["DEBUG"] = debug + ctx.obj["output"] = output + + +@main.command() +@click.option( + "--query", + "-q", + type=click.Path(exists=True), + help="Path to SPARQL query file", + required=True, +) +@click.pass_context +def query(ctx, query): + """Run analysis on the KnowledgeGraph.""" + runner = QueryRunner() + + query_path = Path(query) + try: + output_path = ( + Path(ctx.obj["output"]) + if ctx.obj["output"] + else REPORTS_DIR + / "queries" + / query_path.name.replace(".rq", ".json") + ) + output_path.parent.mkdir(parents=True, exist_ok=True) + + results = runner.run_metric(query_path, output_path=output_path) + + click.echo( + click.style("Query executed successfully: ", fg="green") + + click.style(query_path.name, fg="blue") + ) + if not ctx.obj["output"]: + print_json(data=results) + except Exception as e: + click.echo( + click.style(f"Error executing query: {e}", fg="red"), err=True + ) + + +@main.command() +@click.pass_context +def metrics(ctx): + """Run all metrics and save reports.""" + output_path = Path(ctx.obj["output"]) if ctx.obj["output"] else REPORTS_DIR + output_path.mkdir(parents=True, exist_ok=True) + MetricsRunner(output_dir=output_path).run() + + +if __name__ == "__main__": + main(obj={}) diff --git a/scripts/kg_analysis/interfaces/__init__.py b/scripts/kg_analysis/interfaces/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c223999b2183c013b3c0156b9b6a4580d16cc485 --- /dev/null +++ b/scripts/kg_analysis/interfaces/__init__.py @@ -0,0 +1,4 @@ +from .complexity import query_templates as complexity_query_templates +from .general import query_templates as general_query_templates +from .resources import query_templates as resource_query_templates +from .resources 
import resource_types diff --git a/scripts/kg_analysis/interfaces/complexity.py b/scripts/kg_analysis/interfaces/complexity.py new file mode 100644 index 0000000000000000000000000000000000000000..2d98193b4ce814f2b20b05d8ae3dd5ad0f97e20e --- /dev/null +++ b/scripts/kg_analysis/interfaces/complexity.py @@ -0,0 +1,77 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2025 TU-Dresden, ZIH +# ralf.klammer@tu-dresden.de +import logging + +log = logging.getLogger(__name__) + +query_templates = { + "structural_complexity": { + "name": "Structural Complexity", + "files": { + "classes": { + "name": "Number of Classes", + "file": "RF_001.rq", + "execute": True, + "weight": 0.2, + }, + "properties": { + "name": "Number of properties", + "file": "RF_002_1.rq", + "execute": True, + "weight": 0.2, + }, + "depth": { + "name": "Average class hierarchy depth", + "file": "RF_003.rq", + "execute": True, + "weight": 1, + }, + "width": { + "name": "Average class hierarchy width", + "file": "RF_004.rq", + "execute": True, + "weight": 1, + }, + }, + }, + "semantic_complexity": { + "name": "Semantic Complexity", + "files": { + "restrictions": { + "name": "Number of restrictions", + "file": "RF_005.rq", + "execute": True, + "weight": 0.2, + }, + "axioms": { + "name": "Number of logical axioms", + "file": "RF_006.rq", + "execute": True, + "weight": 1, + }, + }, + }, + "schematic_complexity": { + "name": "Schematic Complexity", + "files": { + "overall_complexity": { + "execute": False, + "name": "Overall Complexity", + "result": 0, + }, + "structural_complexity": { + "name": "Structural Complexity", + "execute": False, + "result": 0, + "weight": 1, + }, + "semantic_complexity": { + "name": "Semantic Complexity", + "execute": False, + "result": 0, + "weight": 1, + }, + }, + }, +} diff --git a/scripts/kg_analysis/interfaces/general.py b/scripts/kg_analysis/interfaces/general.py new file mode 100644 index 0000000000000000000000000000000000000000..0db7ca32ddab08da7e3bff1c978353d4dac6da26 --- /dev/null +++ b/scripts/kg_analysis/interfaces/general.py @@ -0,0 +1,76 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2025 TU-Dresden, ZIH +# ralf.klammer@tu-dresden.de +import logging + +log = logging.getLogger(__name__) + +query_templates = { + "instances": { + "name": "Instance Count", + "file": "GM001.rq", + "execute": True, + }, + "assertions": { + "name": "Assertions Count", + "file": "GM002_1.rq", + "execute": True, + }, + "linkage": { + "name": "Average Linkage Degree", + "file": "GM003_2.rq", + "execute": True, + }, + "edges_out": { + "name": "Edges - outgoing", + "files": { + "total": { + "name": "Total number of outgoing edges", + "file": "GM004_2.rq", + "execute": True, + }, + "min": { + "name": "Minimum of outgoing edges", + "file": "GM004_5.rq", + "execute": True, + }, + "median": { + "name": "Median of outgoing edges", + "file": "GM004_4.rq", + "replace_dict": {"{median_position}": "GM004_2.rq"}, + "execute": True, + }, + "max": { + "name": "Maximum of outgoing edges", + "file": "GM004_6.rq", + "execute": True, + }, + }, + }, + "edges_in": { + "name": "Edges - incoming", + "files": { + "total": { + "name": "Total number of incoming edges", + "file": "GM005_2.rq", + "execute": True, + }, + "min": { + "name": "Minimum of incoming edges", + "file": "GM005_5.rq", + "execute": True, + }, + "median": { + "name": "Median of incoming edges", + "file": "GM005_4.rq", + "replace_dict": {"{median_position}": "GM005_2.rq"}, + "execute": True, + }, + "max": { + "name": "Maximum of incoming edges", + "file": "GM005_6.rq", + "execute": True, 
+ }, + }, + }, +} diff --git a/scripts/kg_analysis/interfaces/resources.py b/scripts/kg_analysis/interfaces/resources.py new file mode 100644 index 0000000000000000000000000000000000000000..302ad9cdf152f3950aa001cd03ee992d3698b511 --- /dev/null +++ b/scripts/kg_analysis/interfaces/resources.py @@ -0,0 +1,140 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2025 TU-Dresden, ZIH +# ralf.klammer@tu-dresden.de +import logging + +log = logging.getLogger(__name__) + +query_templates = { + "instances": { + "name": "Number of Resources", + "file": "RM001_instances_template.rq", + "execute": True, + }, + "assertions": { + "name": "Number of Assertions", + "file": "RM002_assertions_template.rq", + "execute": True, + }, + "linkage": { + "name": "Average Linkage", + "file": "RM003_linkage_template.rq", + "execute": True, + }, + "edges_out": { + "name": "Edges - outgoing", + "files": { + "total": { + "name": "Total number of outgoing edges", + "file": "RM004_1_out_edges_total_template.rq", + "execute": True, + }, + "min": { + "name": "Minimum of outgoing edges", + "file": "RM004_2_out_edges_min_template.rq", + "execute": True, + }, + "median": { + "name": "Median of outgoing edges", + "file": "RM004_3_out_edges_median_template.rq", + "replace_dict": { + "{median_position}": "RM004_1_out_edges_total_template.rq" + }, + "execute": True, + }, + "max": { + "name": "Maximum of outgoing edges", + "file": "RM004_4_out_edges_max_template.rq", + "execute": True, + }, + }, + }, + "edges_in": { + "name": "Edges - incoming", + "files": { + "total": { + "name": "Total number of incoming edges", + "file": "RM005_1_in_edges_total_template.rq", + "execute": True, + }, + "min": { + "name": "Minimum of incoming edges", + "file": "RM005_2_in_edges_min_template.rq", + "execute": True, + }, + "median": { + "name": "Median of incoming edges", + "file": "RM005_3_in_edges_median_template.rq", + "replace_dict": { + "{median_position}": "RM005_1_in_edges_total_template.rq" + }, + "execute": True, + }, + "max": { + "name": "Maximum of incoming edges", + "file": "RM005_4_in_edges_max_template.rq", + "execute": True, + }, + }, + }, + "connectivity": { + "name": "Connectivity", + "file": "RM006_connectivity_template.rq", + "execute": True, + }, +} + +resource_types = { + "dataset": { + "file": "dataset.md", + "uri": "<http://www.w3.org/ns/dcat#Dataset>", + }, + "publication": { + "file": "publication.md", + "uri": "<http://nfdi4earth.de/ontology/Registry>", + }, + "learning_resource": { + "file": "learning_resource.md", + "uri": "<http://schema.org/LearningResource>", + }, + "repository": { + "file": "repository.md", + "uri": "<http://nfdi4earth.de/ontology/Repository>", + }, + "article_lhb": { + "file": "article_lhb.md", + "uri": "<http://nfdi4earth.de/ontology/LHBArticle>", + }, + "standards": { + "file": "standards.md", + "uri": "<http://nfdi4earth.de/ontology/MetadataStandard>", + }, + # "organization": { + # "file": "organization.md", + # "uri": "<http://xmlns.com/foaf/0.1/Organization>", + # }, + "software": { + "file": "software.md", + "uri": "<http://schema.org/SoftwareSourceCode>", + }, + "service": { + "file": "service.md", + "uri": "<http://www.w3.org/ns/sparql-service-description#Service>", + }, + "data_service": { + "file": "data_service.md", + "uri": "<http://www.w3.org/ns/dcat#DataService>", + }, + "aggregator": { + "file": "aggregator.md", + "uri": "<http://nfdi4earth.de/ontology/Aggregator>", + }, + "person": { + "file": "person.md", + "uri": "<http://schema.org/Person>", + }, + "registry": { + "file": "registry.md", + 
"uri": "<http://nfdi4earth.de/ontology/Registry>", + }, +} diff --git a/scripts/kg_analysis/metrics_runner.py b/scripts/kg_analysis/metrics_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..efb2854069a2f255fe90e5107276be2dc53abbd1 --- /dev/null +++ b/scripts/kg_analysis/metrics_runner.py @@ -0,0 +1,337 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2025 TU-Dresden, ZIH +# ralf.klammer@tu-dresden.de +import logging + +import json + +from abc import ABC +from copy import deepcopy +from datetime import datetime +from pathlib import Path +from time import time +from typing import Optional + +from . import REPORTS_DIR +from .query_runner import QueryRunner +from . import ( + complexity_query_templates, + general_query_templates, + resource_query_templates, + resource_types, +) + +log = logging.getLogger(__name__) + + +class MetricsRunnerBase(ABC): + """ + Base class for metric runners that provides common functionality for: + - Query execution and result handling + - File path management + - JSON and TXT report generation + """ + + _query_file: Optional[str] = None + _output_file: Optional[str] = None + _execution_time: float = 0 + fail: bool = False + + def __init__(self, output_dir=None): + self.runner = QueryRunner() + self.base_query_path = Path("metrics") + self.base_output_path = Path(output_dir) if output_dir else REPORTS_DIR + self.base_output_path.mkdir(parents=True, exist_ok=True) + log.info(f"Running metric: {self.__class__.__name__}") + + @property + def query_file(self): + return Path(self._query_file) + + def get_query_path(self, query_file: str) -> Path: + return self.base_query_path.joinpath(query_file) + + @property + def query_path(self): + return self.get_query_path(self.query_file) + + def get_output_path(self, suffix: str = ".txt"): + filename = self._output_file or self.query_file.with_suffix(suffix) + p = self.base_output_path.joinpath(filename) + return p + + @property + def output_path(self): + log.warning("Deprecated: Use output_path_txt or output_path_json") + return self.get_output_path() + + @property + def output_path_txt(self): + return self.get_output_path(suffix=".txt") + + # @property + # def output_path_json(self): + # breakpoint() + # return self.get_output_path(suffix=".json") + + def query_metric(self, query_path: Path, **kwargs) -> Optional[int]: + """Run a metric query and the result""" + start_time = time() + result = None + try: + result = self.runner.execute_query(query_path, **kwargs) + except Exception as e: + log.error(f"Failed to run metric: {e}") + self.fail = True + self._execution_time = time() - start_time + if result and result["results"]["bindings"]: + if not result["results"]["bindings"][0].values(): + return 0 + return list(result["results"]["bindings"][0].values())[0]["value"] + return None + + def save_report(self, result) -> None: + """Run a metric query and optionally save the results.""" + + if self.fail: + log.error(f"Failed to run metric: {self.__class__.__name__}") + + with open(self.output_path_txt, "w") as f: + f.write(f"{datetime.now().isoformat(timespec="minutes")}\n") + f.write(str(result)) + if self._execution_time: + f.write( + f"\n- Execution time: {self._execution_time:.2f} seconds" + ) + log.info(f"Results saved to {self.output_path_txt}") + + def save_to_json(self, dictionary: dict, filename: str): + """ + Saves results dictionary to a JSON file with proper formatting + + Args: + dictionary (dict): Results to save + filename (str, optional): Target filename, uses default if None + """ + with 
open( + self.base_output_path.joinpath(filename), "w", encoding="utf-8" + ) as f: + json.dump(dictionary, f, indent=4, ensure_ascii=False) + + def _get_result(self, query, resource_type, replace_dict=None): + """ + Executes a single query and stores its result + + Args: + query (dict): Query configuration with name, file and execute flag + resource_type (str): URI of the resource type to query + replace_dict (dict, optional): Additional replacements for query templates + """ + if not query["execute"]: + return + log.info(f"Metric: {query['name']} ({query['file']})") + + # Prepare replacement dictionary + _replace_dict = {"{resource_type}": resource_type} + if replace_dict: + _replace_dict.update(replace_dict) + + # Execute query and store results + result = self.query_metric( + self.get_query_path(query["file"]), + replace_dict=_replace_dict, + ) + query["result"] = result + query["execution_time"] = round(self._execution_time, 2) + + def run(self, resource_type_uri=None): + """ + Main execution method for metric analysis. + Handles both general and resource-specific metrics. + + Args: + resource_type_uri (str, optional): URI of specific resource type. + If None, runs general metrics. + Returns: + dict: Collected metrics results + """ + # Create deep copy to avoid modifying templates + queries = deepcopy(self.query_templates) + + # Process each query definition + for query_key, query in queries.items(): + if query_key == "timestamp": + continue + + if "files" in query: + # Handle composite metrics (like edge statistics) + for metric, sub_query in query["files"].items(): + replace_dict = {} + + # Pre-calculate values needed for this metric + if "replace_dict" in sub_query and sub_query["execute"]: + for dict_element, dependency_query in sub_query[ + "replace_dict" + ].items(): + # Execute dependency query (e.g. 
total count for median) + total = self.query_metric( + self.get_query_path(dependency_query), + replace_dict=( + {"{resource_type}": resource_type_uri} + if resource_type_uri + else None + ), + ) + + # Convert and validate result (None if the query failed) + try: + total = int(total) + except (ValueError, TypeError): + log.error( + f"Failed to convert {total} to integer" + ) + continue + + # Special handling for median calculations + if total and metric == "median": + replace_dict[dict_element] = int(total) / 2 + else: + replace_dict[dict_element] = total + + # Execute the actual metric query + self._get_result( + sub_query, + resource_type_uri if resource_type_uri else None, + replace_dict=replace_dict, + ) + else: + # Handle simple metrics (single query) + self._get_result( + query, resource_type_uri if resource_type_uri else None + ) + + # Add execution timestamp + queries["timestamp"] = datetime.now().isoformat(timespec="minutes") + + return queries + + +class MetricsRunner_General(MetricsRunnerBase): + """Handles general metrics that apply to the entire knowledge graph""" + + query_templates = general_query_templates + + def run(self): + """ + Executes and saves general metrics + Returns: + dict: General metrics results + """ + output_path = "general.json" + queries = super().run() + self.save_to_json(queries, output_path) + return queries + + +class MetricsRunner_Resources(MetricsRunnerBase): + """Handles resource-specific metrics for different resource types""" + + resource_types = resource_types + query_templates = resource_query_templates + + def run(self): + """ + Executes metrics for each resource type and saves combined results + Returns: + dict: Resource-specific metrics results + """ + results = {} + # Execute metrics for each resource type + for resource_type, data in self.resource_types.items(): + log.info(f"Resource type: ###{resource_type.upper()}###") + results[resource_type] = super().run(data["uri"]) + results[resource_type]["file"] = data["file"] + + results["timestamp"] = datetime.now().isoformat(timespec="minutes") + self.save_to_json(results, "resources.json") + return results + + +class MetricsRunner_Complexity(MetricsRunnerBase): + + query_templates = complexity_query_templates + + def _calculate_category_complexity(self, category_data): + """Calculates the weighted complexity for a category""" + total = 0.0 + for metric, data in category_data["files"].items(): + if "result" in data and data["result"] is not None: + try: + value = float(data["result"]) + weight = float(data.get("weight", 1.0)) + weighted_value = value * weight + data["weighted_value"] = weighted_value + total += weighted_value + except (ValueError, TypeError) as e: + log.error(f"Error calculating {metric}: {e}") + return total + + def run(self): + """ + Executes complexity metrics and calculates the overall complexity. + This method runs the complexity metrics, calculates the weighted + complexity for each category, and updates the overall complexity. + Finally, it saves the results to a JSON file.
+ + Returns: + dict: Complexity metrics results + """ + # Execute the base run method to get initial query results + queries = super().run() + + # Extract schematic complexity data + schematic_complexity = queries["schematic_complexity"] + + # Calculate complexity for each category and update overall complexity + for category in ("structural_complexity", "semantic_complexity"): + # Calculate the weighted complexity for the current category + result = self._calculate_category_complexity(queries[category]) + # Update the result for the current category + schematic_complexity["files"][category]["result"] = result + schematic_complexity["files"][category]["weighted_value"] = ( + result * schematic_complexity["files"][category]["weight"] + ) + # Update the overall complexity with the weighted result + schematic_complexity["files"]["overall_complexity"][ + "result" + ] += schematic_complexity["files"][category]["weighted_value"] + + # Update the schematic complexity in the queries dictionary + queries["schematic_complexity"] = schematic_complexity + + # Save the results to a JSON file + self.save_to_json(queries, "complexity.json") + return queries + + +class MetricsRunner(ABC): + """Main entry point for executing all metrics""" + + def __init__(self, output_dir=None): + self.output_dir = output_dir + + def run(self): + """ + Executes general, resource-specific and complexity metrics + """ + # Run general metrics for entire knowledge graph + MetricsRunner_General(output_dir=self.output_dir).run() + + # Run metrics for specific resource types + MetricsRunner_Resources(output_dir=self.output_dir).run() + + # Run metrics on schematic complexity + MetricsRunner_Complexity(output_dir=self.output_dir).run() diff --git a/scripts/kg_analysis/query_runner.py b/scripts/kg_analysis/query_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..c6567b7cbc681a268007526a03279ab086dc4510 --- /dev/null +++ b/scripts/kg_analysis/query_runner.py @@ -0,0 +1,53 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2025 TU-Dresden, ZIH +# ralf.klammer@tu-dresden.de +import logging + +import json + +from pathlib import Path +from typing import Optional + +from SPARQLWrapper import JSON, SPARQLWrapper # type: ignore + +from .
import SPARQL_ENDPOINT, SPARQL_TIMEOUT + +log = logging.getLogger(__name__) + + +class QueryRunner: + def __init__(self, timeout: int = SPARQL_TIMEOUT): + self.sparql = SPARQLWrapper(SPARQL_ENDPOINT) + self.sparql.setReturnFormat(JSON) + # SPARQLWrapper.setTimeout() expects the timeout in seconds + self.sparql.setTimeout(timeout) + + def execute_query( + self, query_path: Path, replace_dict: Optional[dict] = None + ) -> dict: + """Execute a SPARQL query from a file and return the JSON results""" + # Read the query from the file + with open(query_path, "r") as f: + query = f.read() + + # Replace placeholders in the query + if replace_dict: + for r_key, r_value in replace_dict.items(): + query = query.replace(r_key, str(r_value)) + # Set the query and execute + self.sparql.setQuery(query) + result = self.sparql.query().convert() + + # Check if the result is a dictionary + if isinstance(result, dict): + return result + raise ValueError("Query did not return a dictionary") + + def run_metric( + self, query_path: Path, output_path: Optional[Path] = None + ) -> Optional[int]: + """Run a metric query, optionally save the raw JSON response, + and return the first result value""" + result = self.execute_query(query_path) + if output_path: + # Persist the full JSON response (used by the CLI query command) + with open(output_path, "w", encoding="utf-8") as f: + json.dump(result, f, indent=4, ensure_ascii=False) + if result and result["results"]["bindings"]: + return list(result["results"]["bindings"][0].values())[0]["value"] + + return None diff --git a/scripts/kg_analysis/table_renderer.py b/scripts/kg_analysis/table_renderer.py new file mode 100644 index 0000000000000000000000000000000000000000..38d0d01714d352ce24be956134a166f44fde1016 --- /dev/null +++ b/scripts/kg_analysis/table_renderer.py @@ -0,0 +1,253 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2025 TU-Dresden, ZIH +# ralf.klammer@tu-dresden.de +import logging + +import json + +from . import REPORTS_DIR + +log = logging.getLogger(__name__) + + +class MetricsTableRenderer: + """Class for rendering metric tables in various formats.""" + + def __init__( + self, + data=None, + table_type="general", + resource_type=None, + metric_key=None, + ): + """ + Initializes the renderer with metric data. + + Args: + data (dict): The JSON data containing the metrics + table_type (str): One of "general", "resource", + "resource_overview" or "complexity" + resource_type (str, optional): Resource type for "resource" tables + metric_key (str, optional): Restrict output to a single metric + """ + self._data = data + self.table_type = table_type + self.resource_type = resource_type + self.metric_key = metric_key + self.output = [] + + def open_file(self): + """Opens the report file and returns its content.""" + try: + with open( + REPORTS_DIR.joinpath(self.table_option["report"]), + "r", + encoding="utf-8", + ) as f: + return json.load(f) + except (FileNotFoundError, json.JSONDecodeError) as e: + # raising a plain f-string would itself be a TypeError + raise RuntimeError(f"Error loading metrics: {e}") from e + + @property + def data(self): + if not self._data: + self._data = self.open_file() + return self._data + + @property + def timestamp(self): + return self.data.get("timestamp", "unknown") + + @property + def table_option(self): + table_options = { + "resource_overview": { + "method": self._render_resource_overview, + "report": "resources.json", + }, + "resource": { + "method": lambda: self._render_resource(self.resource_type), + "report": "resources.json", + }, + "general": { + "method": self._render_general, + "report": "general.json", + }, + "complexity": { + "method": self._render_complexity, + "report": "complexity.json", + }, + } + return table_options[self.table_type] + + def render(self): + """ + Renders the table in the desired format.
diff --git a/scripts/kg_analysis/table_renderer.py b/scripts/kg_analysis/table_renderer.py
new file mode 100644
index 0000000000000000000000000000000000000000..38d0d01714d352ce24be956134a166f44fde1016
--- /dev/null
+++ b/scripts/kg_analysis/table_renderer.py
@@ -0,0 +1,253 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2025 TU-Dresden, ZIH
+# ralf.klammer@tu-dresden.de
+import logging
+
+import json
+
+from . import REPORTS_DIR
+
+log = logging.getLogger(__name__)
+
+
+class MetricsTableRenderer:
+    """Class for rendering metric tables in various formats."""
+
+    def __init__(
+        self,
+        data=None,
+        table_type="general",
+        resource_type=None,
+        metric_key=None,
+    ):
+        """
+        Initializes the renderer with metric data.
+
+        Args:
+            data (dict): The JSON data containing the metrics
+            table_type (str): The kind of table to render
+            resource_type (str): Resource type for "resource" tables
+            metric_key (str): Restricts rendering to a single metric
+        """
+        self._data = data
+        self.table_type = table_type
+        self.resource_type = resource_type
+        self.metric_key = metric_key
+        self.output = []
+
+    def open_file(self):
+        """Opens the report file and returns its content."""
+        try:
+            with open(
+                REPORTS_DIR.joinpath(self.table_option["report"]),
+                "r",
+                encoding="utf-8",
+            ) as f:
+                return json.load(f)
+        except (FileNotFoundError, json.JSONDecodeError) as e:
+            # Raising a plain string is a TypeError; raise an exception
+            raise RuntimeError(f"Error loading metrics: {e}") from e
+
+    @property
+    def data(self):
+        if not self._data:
+            self._data = self.open_file()
+        return self._data
+
+    @property
+    def timestamp(self):
+        return self.data.get("timestamp", "unknown")
+
+    @property
+    def table_option(self):
+        table_options = {
+            "resource_overview": {
+                "method": self._render_resource_overview,
+                "report": "resources.json",
+            },
+            "resource": {
+                "method": lambda: self._render_resource(self.resource_type),
+                "report": "resources.json",
+            },
+            "general": {
+                "method": self._render_general,
+                "report": "general.json",
+            },
+            "complexity": {
+                "method": self._render_complexity,
+                "report": "complexity.json",
+            },
+        }
+        return table_options[self.table_type]
+
+    def render(self):
+        """
+        Renders the table in the desired format.
+
+        Returns:
+            str: The rendered markdown table
+        """
+        self.output = []
+
+        # Get the appropriate rendering method based on the table type
+        self.table_option["method"]()
+
+        # Append the timestamp to the output
+        self.output.append(f"\n*Last updated: {self.timestamp}*\n")
+        return "\n".join(self.output)
+
+    def _render_horizontal_table(self, data, add_links=True):
+        """Renders a horizontal table for the given data."""
+        # Create the table header
+        self.output.append("| Metric | Query (file) | Result |")
+        self.output.append("|--------|------|--------|")
+
+        # Iterate over each query data for the specified resource type
+        for q_data in data.values():
+            if "file" in q_data:
+                filename = (
+                    f"[{q_data['file']}](#{q_data['file']})"
+                    if add_links
+                    else q_data["file"]
+                )
+                row = [
+                    q_data["name"],
+                    filename,
+                    str(q_data["result"]),
+                ]
+                self.output.append("| " + " | ".join(row) + " |")
+            elif "files" in q_data:
+                self.output.append(f"| **{q_data['name']}** | | |")
+                for sub_q_data in q_data["files"].values():
+                    filename = (
+                        f"[{sub_q_data['file']}](#{sub_q_data['file']})"
+                        if add_links
+                        else sub_q_data["file"]
+                    )
+                    sub_row = [
+                        sub_q_data["name"],
+                        filename,
+                        str(sub_q_data["result"]),
+                    ]
+                    self.output.append("| " + " | ".join(sub_row) + " |")
+
+    def _render_resource(self, resource_type):
+        """Renders the resource-specific table - single metric only"""
+        if resource_type not in self.data:
+            # render() joins self.output, so report the problem there
+            # instead of returning a string that would be discarded
+            self.output.append(f"*No metrics found for {resource_type}*")
+            return
+
+        self._render_horizontal_table(self.data[resource_type])
+
+    def _render_general(self):
+        """Renders a general table - general metrics or a single metric."""
+        data = (
+            {self.metric_key: self.data[self.metric_key]}
+            if self.metric_key
+            else self.data
+        )
+        # Render the horizontal table
+        # Add links to the file names if a metric key is specified,
+        # i.e. the table is for a single metric
+        self._render_horizontal_table(data, add_links=bool(self.metric_key))
+
+    def _render_resource_overview(self):
+        """Renders the resource overview table."""
+        # Get the names of all resources
+        resource_names = [
+            key
+            for key, resource in self.data.items()
+            if isinstance(resource, dict)
+        ]
+        ordered_resource_names = sorted(resource_names)
+
+        metric_names = []
+        rows = []
+
+        # Iterate over each resource to collect metric names and results
+        for resource_name in ordered_resource_names:
+            resource = self.data[resource_name]
+            row = [resource_name.upper()]
+            for metric in resource.values():
+                if isinstance(metric, str):
+                    continue
+                elif "files" in metric:
+                    for sub_metric in metric["files"].values():
+                        if sub_metric["name"] not in metric_names:
+                            metric_names.append(sub_metric["name"])
+                        # Cast to str: results may be numeric
+                        row.append(str(sub_metric.get("result", "-") or "-"))
+                else:
+                    row.append(str(metric.get("result", "-") or "-"))
+                    if metric["name"] not in metric_names:
+                        metric_names.append(metric["name"])
+            rows.append("| " + " | ".join(row) + " |")
+
+        # Construct the table header and rows
+        self.output.append(f"| Resource type | {' | '.join(metric_names)} |")
+        self.output.append(
+            f"| --- |{' | '.join(['---' for _ in metric_names])} |"
+        )
+        self.output.extend(rows)
+
+    def _render_complexity(self):
+        """Renders a complexity table."""
+        # Base header for all tables
+        header = ["Metric", "Query", "Value", "Weight", "Weighted Value"]
+
+        metric_type = self.metric_key
+
+        # Prepend a Category column for the overview; it holds the category
+        # name in banner rows and the leading empty cell in metric rows
+        if not metric_type:
+            header.insert(0, "Category")
+
+        # Create table header
+        self.output.append("| " + " | ".join(header) + " |")
+        self.output.append("| " + " | ".join(["---" for _ in header]) + " |")
+
+        # Select categories based on type
+        categories = {}
+        if metric_type:
+            # Search directly or in nested structures
+            if metric_type in self.data:
+                categories[metric_type] = self.data[metric_type]
+            else:
+                for value in self.data.values():
+                    if (
+                        isinstance(value, dict)
+                        and "files" in value
+                        and metric_type in value["files"]
+                    ):
+                        categories[metric_type] = {
+                            "name": metric_type,
+                            "files": {
+                                metric_type: value["files"][metric_type],
+                            },
+                        }
+                        break
+        else:
+            # Include all categories except the timestamp
+            categories = {
+                k: v for k, v in self.data.items() if k != "timestamp"
+            }
+
+        # Iterate through the categories
+        for data in categories.values():
+            if "files" not in data:
+                continue
+
+            # Category header row with empty cells for the other columns
+            empty_cols = len(header) - 1
+            if not metric_type:
+                self.output.append(
+                    f"| **{data['name']}** |" + " |" * empty_cols
+                )
+
+            for metric_data in data["files"].values():
+                row = [
+                    str(metric_data["name"]),
+                    str(metric_data.get("file", "-")),
+                    str(metric_data.get("result", "-")),
+                    str(metric_data.get("weight", "-")),
+                    str(metric_data.get("weighted_value", "-")),
+                ]
+
+                # Leading Category cell for overview rows
+                if not metric_type:
+                    row.insert(0, "")
+
+                self.output.append("| " + " | ".join(row) + " |")
diff --git a/scripts/kg_analysis/util.py b/scripts/kg_analysis/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..722ac16b7f19968b53f9905be7f06056e2ff7965
--- /dev/null
+++ b/scripts/kg_analysis/util.py
@@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2025 TU-Dresden, ZIH
+# ralf.klammer@tu-dresden.de
+import logging
+
+log = logging.getLogger(__name__)
+
+
+def cli_startup(log_level=logging.INFO, log_file=None):
+    """Configure root logging for command-line entry points."""
+    log_config = dict(
+        level=log_level,
+        format="%(asctime)s %(name)-10s %(levelname)-4s %(message)s",
+    )
+    if log_file:
+        log_config["filename"] = log_file
+
+    logging.basicConfig(**log_config)
+    logging.getLogger("").setLevel(log_level)
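+
+
+if __name__ == "__main__":
+    # Minimal usage sketch (illustrative, not part of the package API):
+    # render the general metrics table from a previously generated
+    # general.json report in REPORTS_DIR and print the markdown.
+    renderer = MetricsTableRenderer(table_type="general")
+    print(renderer.render())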
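+
+
+if __name__ == "__main__":
+    # Minimal usage sketch (illustrative, not part of the package API):
+    # configure logging the way the CLI entry point would, then log a
+    # test message. "kg_analysis.log" is a hypothetical file name.
+    cli_startup(log_level=logging.DEBUG, log_file="kg_analysis.log")
+    log.debug("logging configured")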
diff --git a/scripts/pyproject.toml b/scripts/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..181b2ecdce022c864ead8b88709e101870fa1292
--- /dev/null
+++ b/scripts/pyproject.toml
@@ -0,0 +1,28 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "kg_analysis"
+version = "1.0.0b1"
+description = "Knowledge Graph Analysis"
+authors = [
+    { name = "Ralf Klammer", email = "ralf.klammer@tu-dresden.de" }
+]
+requires-python = ">=3.10"
+license = { text = "Copyright (C) 2025 TU-Dresden, ZIH" }
+dependencies = [
+    "rdflib",
+    "SPARQLWrapper",
+    "click",
+    "rich",
+]
+
+[project.scripts]
+kg_analysis = "kg_analysis.cli:main"
+
+[tool.black]
+line-length = 79
+
+[tool.mypy]
+warn_no_return = false