Skip to content
Snippets Groups Projects

split embed stored retrieved

Merged Jamal Rnjbal requested to merge 20-splitter-retriever into main
2 files
+ 236
55
Compare changes
  • Side-by-side
  • Inline
Files
2
+ 86
55
@@ -7,87 +7,118 @@ import os
from openai import OpenAI
from loguru import logger
import json
import pandas as pd
from kg_maker import KgMaker
from dotenv import load_dotenv, find_dotenv
import os
from openai import OpenAI
from loguru import logger
import json
from networkx.readwrite import json_graph
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import networkx as nx
import matplotlib.pyplot as plt
from eventlog_handler import Event_handler
from langchain_text_splitters import RecursiveJsonSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
class KgToLLM:
    """Answer process-mining questions about a BPMN knowledge graph via RAG.

    A networkx graph is serialized to node-link JSON, split into small
    chunks, embedded into a FAISS index, and the chunks retrieved for a
    question are fed to an OpenAI chat model.  The model must reply with a
    JSON object holding a relevant sub-graph plus a human-readable answer.
    """

    # Prompt for the retrieval chain.  Literal JSON braces are doubled
    # ("{{" / "}}") so that PromptTemplate treats only {context} and
    # {question} as template variables.
    template: str = """
You are an intelligent Process Mining assistant. You will receive context that is parts of a knowledge graph in JSON format. Your task is to answer the provided question based on this context and return a JSON that can be converted into a networkx knowledge graph.
Answer the question based only on the following context:
{context}
Here is an example question:
*Query:* What is the average time to receive an item?
*Instructions:*
- Your answer should be in JSON format.
- Provide two parts in the answer:
1. The relevant knowledge graph with only the necessary nodes and links.
2. A human-understandable answer explaining the result. Be concise.
- Maintain the same JSON structure as provided for the knowledge graph.
- If the question is about how long a process takes, then show all possible paths to this process and take the average time.
Please give the output like this:
{{
"knowledge_graph": {{
"directed": true,
"multigraph": false,
"graph": {{}},
"nodes": [
{{
"id": "order"
}},
{{
"id": "recieve"
}}
],
"links": [
{{
"Average Time": 20,
"Frequency": 3,
"source": "order",
"target": "recieve"
}}
]
}},
"human_answer": "The average time to receive an item is 20 hours."
}}
Question: {question}
"""

    def __init__(self, graph, question=None):
        """Serialize *graph* and set up the OpenAI chat + embedding clients.

        Parameters
        ----------
        graph : networkx graph
            The BPMN knowledge graph to query.
        question : str, optional
            Default question used by ``store_embedd``/``llm`` when no
            explicit question is supplied.
        """
        # Pretty-printed node-link JSON dump of the graph.
        self.data = json.dumps(json_graph.node_link_data(graph), indent=2)
        self.question = question
        # Bound as attributes (not locals) because llm()/store_embedd()
        # read them later — the original bound locals and then crashed.
        self.custom_rag_prompt = PromptTemplate.from_template(self.template)
        logger.info("connect to OpenAI")
        load_dotenv(find_dotenv())
        api_key = os.getenv("OPENAI_API_KEY")
        self.client = ChatOpenAI(
            api_key=api_key,
            model="gpt-4o",
            temperature=0,
            max_tokens=None,
            timeout=None,
            max_retries=2,
        )
        self.embedding_model = OpenAIEmbeddings(
            model="text-embedding-3-large", api_key=api_key
        )
        # Populated by store_embedd(); llm() requires it.
        self.retriever = None
        logger.success("Connected to OpenAI with Key")

    def graph_to_json(self, graph):
        """Replace the stored graph data with the node-link dict of *graph*.

        NOTE(review): __init__ stores a JSON *string* while this stores a
        *dict* — callers relying on self.data should confirm which form
        they expect.
        """
        self.data = json_graph.node_link_data(graph)

    @staticmethod
    def _format_docs(docs):
        """Join retrieved documents into one context string for the prompt."""
        return "\n\n".join(doc.page_content for doc in docs)

    @staticmethod
    def split(graph):
        """Split a node-link JSON dict into small Document chunks.

        Parameters
        ----------
        graph : dict
            Node-link representation of the knowledge graph.

        Returns
        -------
        list[Document]
            One document per JSON chunk, ready for embedding.
        """
        splitter = RecursiveJsonSplitter(max_chunk_size=300)
        json_chunks = splitter.split_json(json_data=graph, convert_lists=True)
        return [Document(page_content=json.dumps(chunk)) for chunk in json_chunks]

    def store_embedd(self, sub_graphs, question=None):
        """Embed *sub_graphs* into FAISS and retrieve chunks for the question.

        Side effect: sets ``self.retriever`` for use by ``llm``.
        Falls back to the question given at construction time.
        """
        if question is None:
            question = self.question
        db = FAISS.from_documents(sub_graphs, self.embedding_model)
        self.retriever = db.as_retriever(
            search_type="similarity", search_kwargs={"k": 6}
        )
        return self.retriever.invoke(question)

    def llm(self, question=None):
        """Run the RAG chain and return ``(knowledge_graph, human_answer)``.

        Requires ``store_embedd`` to have been called first so that
        ``self.retriever`` exists.  Raises KeyError if the model's JSON
        reply lacks the expected keys.
        """
        if question is None:
            question = self.question
        chain = (
            {
                # Fixed: the original referenced self.format_docs, which
                # does not exist (the method was name-mangled __format_docs).
                "context": self.retriever | self._format_docs,
                "question": RunnablePassthrough(),
            }
            | self.custom_rag_prompt
            | self.client
            | JsonOutputParser()
        )
        answer = chain.invoke(question)
        return answer["knowledge_graph"], answer["human_answer"]
\ No newline at end of file
class Document:
    """Minimal stand-in for a LangChain document: text plus optional metadata.

    Parameters
    ----------
    page_content : str
        The chunk's text content.
    metadata : dict, optional
        Arbitrary metadata; a fresh empty dict is used when omitted.
    """

    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        # Test against None (not truthiness) so a caller-supplied empty
        # dict is kept and can still be mutated through this object.
        self.metadata = {} if metadata is None else metadata
Loading