Skip to content
Snippets Groups Projects

split embed stored retrieved

Merged Jamal Rnjbal requested to merge 20-splitter-retriever into main
2 files
+ 236
55
Compare changes
  • Side-by-side
  • Inline
Files
2
+ 86
55
@@ -7,87 +7,118 @@ import os
from openai import OpenAI
from loguru import logger
import json
import pandas as pd
from kg_maker import KgMaker
from dotenv import load_dotenv, find_dotenv
import os
from openai import OpenAI
from loguru import logger
import json
from networkx.readwrite import json_graph
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import networkx as nx
import matplotlib.pyplot as plt
from eventlog_handler import Event_handler
from langchain_text_splitters import RecursiveJsonSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
class KgToLLM:
    """Answer process-mining questions about a BPMN knowledge graph via RAG.

    A networkx graph is serialized to node-link JSON, split into small
    chunks, embedded into a FAISS index, and the chunks retrieved for a
    question are fed to an OpenAI chat model.  The model must reply with a
    JSON object holding a relevant sub-graph plus a human-readable answer.
    """

    # Prompt for the retrieval chain.  Literal JSON braces are doubled
    # ("{{" / "}}") so that PromptTemplate treats only {context} and
    # {question} as template variables.
    template: str = """
You are an intelligent Process Mining assistant. You will receive context that is parts of a knowledge graph in JSON format. Your task is to answer the provided question based on this context and return a JSON that can be converted into a networkx knowledge graph.
Answer the question based only on the following context:
{context}
Here is an example question:
*Query:* What is the average time to receive an item?
*Instructions:*
- Your answer should be in JSON format.
- Provide two parts in the answer:
1. The relevant knowledge graph with only the necessary nodes and links.
2. A human-understandable answer explaining the result. Be concise.
- Maintain the same JSON structure as provided for the knowledge graph.
- If the question is about how long a process takes, then show all possible paths to this process and take the average time.
Please give the output like this:
{{
"knowledge_graph": {{
"directed": true,
"multigraph": false,
"graph": {{}},
"nodes": [
{{
"id": "order"
}},
{{
"id": "recieve"
}}
],
"links": [
{{
"Average Time": 20,
"Frequency": 3,
"source": "order",
"target": "recieve"
}}
]
}},
"human_answer": "The average time to receive an item is 20 hours."
}}
Question: {question}
"""

    def __init__(self, graph, question=None):
        """Serialize *graph* and set up the OpenAI chat + embedding clients.

        Parameters
        ----------
        graph : networkx graph
            The BPMN knowledge graph to query.
        question : str, optional
            Default question used by ``store_embedd``/``llm`` when no
            explicit question is supplied.
        """
        # Pretty-printed node-link JSON dump of the graph.
        self.data = json.dumps(json_graph.node_link_data(graph), indent=2)
        self.question = question
        # Bound as attributes (not locals) because llm()/store_embedd()
        # read them later — the original bound locals and then crashed.
        self.custom_rag_prompt = PromptTemplate.from_template(self.template)
        logger.info("connect to OpenAI")
        load_dotenv(find_dotenv())
        api_key = os.getenv("OPENAI_API_KEY")
        self.client = ChatOpenAI(
            api_key=api_key,
            model="gpt-4o",
            temperature=0,
            max_tokens=None,
            timeout=None,
            max_retries=2,
        )
        self.embedding_model = OpenAIEmbeddings(
            model="text-embedding-3-large", api_key=api_key
        )
        # Populated by store_embedd(); llm() requires it.
        self.retriever = None
        logger.success("Connected to OpenAI with Key")

    def graph_to_json(self, graph):
        """Replace the stored graph data with the node-link dict of *graph*.

        NOTE(review): __init__ stores a JSON *string* while this stores a
        *dict* — callers relying on self.data should confirm which form
        they expect.
        """
        self.data = json_graph.node_link_data(graph)

    @staticmethod
    def _format_docs(docs):
        """Join retrieved documents into one context string for the prompt."""
        return "\n\n".join(doc.page_content for doc in docs)

    @staticmethod
    def split(graph):
        """Split a node-link JSON dict into small Document chunks.

        Parameters
        ----------
        graph : dict
            Node-link representation of the knowledge graph.

        Returns
        -------
        list[Document]
            One document per JSON chunk, ready for embedding.
        """
        splitter = RecursiveJsonSplitter(max_chunk_size=300)
        json_chunks = splitter.split_json(json_data=graph, convert_lists=True)
        return [Document(page_content=json.dumps(chunk)) for chunk in json_chunks]

    def store_embedd(self, sub_graphs, question=None):
        """Embed *sub_graphs* into FAISS and retrieve chunks for the question.

        Side effect: sets ``self.retriever`` for use by ``llm``.
        Falls back to the question given at construction time.
        """
        if question is None:
            question = self.question
        db = FAISS.from_documents(sub_graphs, self.embedding_model)
        self.retriever = db.as_retriever(
            search_type="similarity", search_kwargs={"k": 6}
        )
        return self.retriever.invoke(question)

    def llm(self, question=None):
        """Run the RAG chain and return ``(knowledge_graph, human_answer)``.

        Requires ``store_embedd`` to have been called first so that
        ``self.retriever`` exists.  Raises KeyError if the model's JSON
        reply lacks the expected keys.
        """
        if question is None:
            question = self.question
        chain = (
            {
                # Fixed: the original referenced self.format_docs, which
                # does not exist (the method was name-mangled __format_docs).
                "context": self.retriever | self._format_docs,
                "question": RunnablePassthrough(),
            }
            | self.custom_rag_prompt
            | self.client
            | JsonOutputParser()
        )
        answer = chain.invoke(question)
        return answer["knowledge_graph"], answer["human_answer"]
\ No newline at end of file
class Document:
    """Minimal stand-in for a LangChain document: text plus optional metadata.

    Parameters
    ----------
    page_content : str
        The chunk's text content.
    metadata : dict, optional
        Arbitrary metadata; a fresh empty dict is used when omitted.
    """

    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        # Test against None (not truthiness) so a caller-supplied empty
        # dict is kept and can still be mutated through this object.
        self.metadata = {} if metadata is None else metadata
Loading