Doc Load

Author

Dmitriy Leybel

import os
from collections import OrderedDict
from collections.abc import Collection
from uuid import uuid4
from functools import partial
import json
import logging
import logging.config
from copy import deepcopy

import param

import panel as pn
from panel.chat import ChatMessage, ChatInterface, ChatFeed
from panel.theme import Material
from panel.reactive import ReactiveHTML

from langchain_openai import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_core.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
    MessagesPlaceholder,
)
from langchain.schema.runnable import RunnableLambda
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_core.output_parsers import (
    StrOutputParser,
    JsonOutputParser,
    BaseTransformOutputParser,
)
from langchain_core.memory import BaseMemory
from langchain_core.runnables import RunnablePassthrough

from typing import Dict, List, Union, Tuple, Optional
from pydantic.v1 import BaseModel, Field


from neo4j import GraphDatabase

import tiktoken

import pandas as pd
import datetime as dt
import numpy as np
import asyncio
from dotenv import load_dotenv
import re
import time
from pprint import pprint

from ui_helpers import DraggableList
from langchain_helpers import token_len, JsonNodesEdgesStreamParser, initialize_nb, VisJSHandler, GraphHistory

from IPython.display import clear_output, display
# Initialise Panel before any Panel object is created.
pn.extension()

# Configure logging from the JSON dictConfig file. The encoding is given
# explicitly so the result does not depend on the platform's locale default.
with open('logs/logging_config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)
logging.config.dictConfig(config)
logger = logging.getLogger('root')

# Read the article that will be turned into a graph. The text contains
# typographic quotes and ellipses, so decoding with a locale-dependent default
# (e.g. cp1252 on Windows) could corrupt it or raise UnicodeDecodeError.
# Assumes the file is stored as UTF-8 — TODO confirm.
with open("lean_startup_article.txt", "r", encoding="utf-8") as f:
    txt = f.read()

# Split the article into overlapping chunks; chunk_size and chunk_overlap are
# measured with token_len (imported from langchain_helpers).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=260, chunk_overlap=20, length_function=token_len
)
split = text_splitter.create_documents([txt])

# Show one chunk as a quick sanity check of the splitting.
pprint(split[5].page_content)
('When it comes time to grow your team, Jeff Jordan and Eric pointed out three '
 'hiring mistakes to avoid.\n'
 'First, do not hire someone just because they have the domain expertise. You '
 'want to be sure that they can work with the resources you can provide in '
 'terms of staff size and budgets. In other words, be sure they are ready to '
 'work at a startup. Jeff elaborated, “You want to tee [your hires] to the '
 'state of the company.” For example, don’t court public-ready CFOs when your '
 'financials aren’t even on QuickBooks yet.\n'
 'Second, don’t simply hire your buddies. “You’re kind of looking for '
 'founders,” not friends, remarked Jeff. “If you know the true story of any '
 '[startup], the early employees are every bit as entrepreneurial, every bit '
 'as dedicated … as the true founder,” Eric seconded.')

Prompting

class Node(BaseModel):
    # Schema of a single graph node.
    # NOTE: documented with comments rather than a docstring on purpose:
    # pydantic v1 copies a model's docstring into its generated JSON schema,
    # and that schema is embedded in the prompt text.
    semantic_id: str = Field(
        default=...,
        description="The unique identifier of the node that is reference to create edges between different nodes.",
    )
    category: str = Field(
        default=...,
        description="The category of the node",
    )
    # Free-form extra properties; omitted (None) when the node has none.
    attributes: Optional[Dict[str, Union[str, int, bool]]] = Field(
        default=None,
        description="Additional properties of the node",
    )

class Edge(BaseModel):
    # Schema of a single directed edge between two nodes.
    # NOTE: documented with comments rather than a docstring on purpose:
    # pydantic v1 copies a model's docstring into its generated JSON schema,
    # and that schema is embedded in the prompt text.
    from_node: str = Field(
        default=...,
        description="The id of the node from which the edge originates. Only semantic_ids belong here, nothing else.",
    )
    to_node: str = Field(
        default=...,
        description="The id of the node to which the edge connects. Only semantic_ids belong here, nothing else.",
    )
    category: str = Field(
        default=...,
        description="The type of the relationship",
    )
    # Free-form extra properties; omitted (None) when the edge has none.
    attributes: Optional[Dict[str, Union[str, int, bool]]] = Field(
        default=None,
        description="Additional properties of the edge",
    )

class Graph(BaseModel):
    # Top-level container: the complete set of nodes and edges.
    # NOTE: documented with comments rather than a docstring on purpose:
    # pydantic v1 copies a model's docstring into its generated JSON schema,
    # and that schema is embedded in the prompt text.
    nodes: List[Node] = Field(
        default=...,
        description="A list of nodes in the graph",
    )
    edges: List[Edge] = Field(
        default=...,
        description="A list of edges in the graph",
    )

# Task rules that precede the JSON schema in the system prompt.
json_rules = """We need to create a JSON object that contains a list of nodes and edges that connect the nodes.
Both, nodes and edges, have optional attributes.
Your goal is to extract as much pertinent information from the passage as possible and create nodes and edges with the extracted information.
If history is provided, it will be in the JSON schema you are given. You may create new connections between the nodes and edges in the history and the new nodes you are producing.
If you wish to change/update any of the node attributes in the provided history based on newly gathered information, simply reuse the semantic_ids of the nodes you wish to change.
If you wish to modify/update the edge attributes in the history, reuse the semantic_ids of the 'from' and 'to' nodes of any edge you wish to change.
Use the following schema and make sure to read the descriptions:
"""

# Full instruction text: the rules, then the JSON schema of Graph, then a
# separator line.
json_prompt_instructions = f"{json_rules}{Graph.schema_json()}\n-----\n"

# System-message template. {instructions} is pre-filled below through
# partial_variables; {history} stays open as a template variable.
graph_creator_content = """You are a brilliant and efficient creator of json objects that capture the essence of passages and who follows instructions unbelievably well.
You will be first given instructions and a json schema, then you will be provided a passage to extract the information from.
Your instructions are:
{instructions}
History:
{history}
"""
graph_analyst_prompt = SystemMessagePromptTemplate.from_template(
    template=graph_creator_content,
    partial_variables=dict(instructions=json_prompt_instructions),
)

# Human-message template that carries the passage to analyse.
pass_passage_content = "Below is the passage to extract the values from.\n*****\nPassage:\n{passage}"
pass_passage_template = HumanMessagePromptTemplate.from_template(pass_passage_content)

# System message followed by the human message.
gen_template = graph_analyst_prompt + pass_passage_template

# Two chat-model handles, one per OpenAI model name.
llm35 = ChatOpenAI(model='gpt-3.5-turbo-0125')
llm4 = ChatOpenAI(model='gpt-4-1106-preview')

Network Viz

# Working Class
class VisJSNetwork(ReactiveHTML):
    """Panel ``ReactiveHTML`` component that draws a graph with vis-network.

    ``nodes`` and ``edges`` accumulate everything added so far; the
    ``after_layout`` script feeds them to the browser-side datasets.
    ``new_nodes`` and ``new_edges`` are short-lived staging lists: each
    ``add_*`` call fills one, triggers its parameter event so the script of the
    same name in ``_scripts`` can forward just the new items, and then resets
    it to an empty list.
    """

    nodes = param.List([])
    edges = param.List([])
    # Needed for adding nodes and edges without sending all nodes and edges
    new_nodes = param.List([])
    new_edges = param.List([])

    def add_nodes(self, nodes):
        """Add one node or a list of nodes to the network.

        Args:
            nodes: A single node or a ``list`` of nodes. Anything that is not
                a ``list`` is treated as one node and wrapped in a list.
        """
        # Built-in ``list`` instead of the deprecated ``typing.List`` alias.
        if not isinstance(nodes, list):
            nodes = [nodes]
        logger.debug("Adding nodes: %s", nodes)
        self.new_nodes.extend(nodes)
        self.nodes.extend(nodes)
        # The lists were mutated in place, so the event has to be fired by hand.
        self.param.trigger("new_nodes")
        self.new_nodes = []

    def add_edges(self, edges):
        """Add one edge or a list of edges to the network.

        Args:
            edges: A single edge or a ``list`` of edges. Anything that is not
                a ``list`` is treated as one edge and wrapped in a list.
        """
        # Built-in ``list`` instead of the deprecated ``typing.List`` alias.
        if not isinstance(edges, list):
            edges = [edges]
        logger.debug("Adding edges: %s", edges)
        self.new_edges.extend(edges)
        self.edges.extend(edges)
        # The lists were mutated in place, so the event has to be fired by hand.
        self.param.trigger("new_edges")
        self.new_edges = []

    def add_item(self, item):
        """Dispatch ``item`` to :meth:`add_nodes` or :meth:`add_edges`.

        Only the first key of ``item`` is inspected: ``'nodes'`` or
        ``'edges'`` selects the method, and the value under that key is passed
        on. An empty mapping or any other first key is ignored with a warning
        instead of raising ``StopIteration`` or being dropped silently.

        Args:
            item: Mapping whose first key is ``'nodes'`` or ``'edges'``.
        """
        item_type = next(iter(item.keys()), None)
        if item_type == 'nodes':
            self.add_nodes(item['nodes'])
        elif item_type == 'edges':
            self.add_edges(item['edges'])
        else:
            logger.warning("Ignoring item with unrecognised type: %r", item_type)

    _template = """
    <div id="mynetwork" class="mynetwork" style="height: 800px; width: 1000px; background-color: #1f2455;"></div>
    <div id="config"></div>
    <link rel="preconnect" href="https://fonts.googleapis.com" />
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
<link href="https://fonts.googleapis.com/css2?family=Sora:wght@300&display=swap" rel="stylesheet" />

    """

    # Browser-side scripts.
    #   add_nodes_js / add_edges_js: return a function that builds a multi-line
    #       HTML label from every attribute of each item (nodes skip id, font
    #       and semantic_id; edges skip from, to and id) and upserts the items
    #       into the vis.DataSet.
    #   after_layout: creates the datasets, loads the existing nodes/edges and
    #       instantiates the vis.Network with its styling options.
    #   new_nodes / new_edges: forward freshly staged items to the datasets.
    # NOTE(review): network_nodes / network_edges are assigned without
    # var/let, so they look like implicit globals that would be shared by
    # several instances on one page — confirm before rendering more than one.
    _scripts = {
        "add_nodes_js": """return function(nodes) { 
for (var i = 0; i < nodes.length; i++) {
    var node = nodes[i];
    var label = "";
    var groupAdded = false;
    for (var key in node) {
        if (key != 'id' && key != 'font' && key != 'semantic_id') {  // Exclude 'id', 'font' and 'semantic_id' from the attributes
            var keyFormatted = "<b>" + key + "</b>:\\n";
            var valueFormatted = node[key].toString();
            // Split key to adhere to the 30 character limit per line
            if (keyFormatted.length > 30) {
                var keySplitIndex = keyFormatted.lastIndexOf(' ', 30);
                if (keySplitIndex == -1) keySplitIndex = 30; // Split at 30 if no spaces
                keyFormatted = keyFormatted.substring(0, keySplitIndex) + "\\n";
            }
            label += keyFormatted;
            // Split value to adhere to the 30 character limit per line
            while (valueFormatted.length > 0) {
                if (valueFormatted.length > 30) {
                    var valueSplitIndex = valueFormatted.lastIndexOf(' ', 30);
                    if (valueSplitIndex == -1) valueSplitIndex = 30; // Split at 30 if no spaces
                    label += valueFormatted.substring(0, valueSplitIndex) + "\\n";
                    valueFormatted = valueFormatted.substring(valueSplitIndex).trim();
                } else {
                    label += valueFormatted + "\\n";
                    break; // Exit loop if the rest of the value fits in one line
                }
            }
        }
    }
    // Replace special HTML characters in the label
    node['label'] = label.replace(/&/g, "&amp;").replace(/"/g, "&quot;").replace(/'/g, "&#039;");
};
network_nodes.update(nodes);
};
""",
        
        "add_edges_js": """return function(edges) { 
for (var i = 0; i < edges.length; i++) {
    var edge = edges[i];
    var label = "";
    var fromToAdded = false;
    for (var key in edge) {
        if (key != 'from' && key != 'to' && key != 'id') {  // Exclude 'from', 'to' and 'id' from the attributes
            if (key == 'label' && !fromToAdded) {
                label += "<b>" + key + "</b>:\\n" + edge[key] + "\\n";
                fromToAdded = true;
            } else {
                label += "<b>" + key + "</b>:\\n" + edge[key] + "\\n";
            }
        }
    }
    edge['label'] = label.replace(/&/g, "&amp;").replace(/"/g, "&quot;").replace(/'/g, "&#039;");
};
network_edges.update(edges);
};

""",
        "after_layout": """
network_nodes = new vis.DataSet();
self.add_nodes_js()(data.nodes);
network_edges = new vis.DataSet();
self.add_edges_js()(data.edges);
network_data = {nodes: network_nodes, edges: network_edges};

// network.fit()
var options = {
    nodes: {
        shape: 'box',
        shapeProperties: {
            borderRadius: 6
        },
        font: {
            face: 'Palatino Linotype',
            align: 'left',
            color: '#1f2455',
            multi: 'html',
            bold: {
                face: 'Lucida Sans Unicode',
                size: 12
            }
        },
        borderWidth: 3,
        color: {
            border: '#b73e3d'
        },
        shadow: {
            enabled: true,
            color: '#b73e3d',
            size: 5,
            x: 3,
            y: 3
        }
    },
    edges: {
        arrows: {
            to: {enabled: true, scaleFactor:1, type:'arrow'},
            middle: {enabled: false, scaleFactor:1, type:'arrow'},
            from: {enabled: false, scaleFactor:1, type:'arrow'}
        },
        length: 350,
        font: {
            face: 'Palatino Linotype',
            size: 12,
            color: '#e9b07d',
            strokeWidth: 0,
            strokeColor: '#000000',
            multi: 'html',
            align: 'middle',
            bold: {
                face: 'Lucida Sans Unicode',
                size: 12
                }
        },
    }
}
var network = new vis.Network(mynetwork, network_data, options)
""",
        "new_nodes": """self.add_nodes_js()(data.new_nodes)""",
        "new_edges": """self.add_edges_js()(data.new_edges)"""
    }
    # vis-network bundle loaded in the browser; the scripts above use its `vis` global.
    __javascript__ = ["https://unpkg.com/vis-network/standalone/umd/vis-network.min.js"]
    # __javascript_modules__= ['https://cdn.jsdelivr.net/npm/cytoscape-dom-node@1.2.0/src/index.min.js']
# TODO: Assign special colors to groups once we have a solid flow for group ID
visjs_network = VisJSNetwork()

# Stop the server left over from a previous run of this cell (if any) before
# starting a fresh one on the same port.
try:
    server.stop()
except NameError:
    # First run: `server` does not exist yet, so there is nothing to stop.
    # Without this handler the NameError would still propagate after the
    # `finally` clause had started the new server.
    pass
finally:
    # Always (re)start the server, even if stopping the old one failed.
    server = pn.serve(pn.Row(visjs_network), port=46582, show=True, title='Knowledge graph sheeeeeeit')

Loop over pre-generated history to create nodes/edges

import pickle
# Load the pre-generated graph history from disk.
# SECURITY NOTE(review): pickle.load can execute arbitrary code while
# deserialising — only load graph_history.pkl when it comes from a trusted
# source.
with open('graph_history.pkl', 'rb') as f:
    history = pickle.load(f)
async def async_iterable(data_list):
    """Expose a regular iterable as an async generator.

    Yields the elements of ``data_list`` one by one, in order, so the
    iterable can be consumed with ``async for``.
    """
    for element in iter(data_list):
        yield element

# Flatten the history mapping into (stringified key, value) tuples.
tup_list = [(str(k), v) for k,v in history.items()]

# Replay the history into the live network, one entry at a time.
# NOTE: top-level `async for` / `await` are only valid where top-level await
# is enabled (e.g. an IPython/Jupyter cell); in a plain .py module this is a
# SyntaxError.
async for dat in async_iterable(tup_list):
    proc_item = VisJSHandler.process_item(dat)
    # Pause half a second before each update — presumably so the graph can be
    # watched growing step by step.
    await asyncio.sleep(.5)
    visjs_network.add_item(proc_item)