# src/rag/manager.py

import os
from typing import Any, List, Optional

import chromadb
import pdfplumber
from llama_index.core import Document, Settings, StorageContext, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore

from src.llm.models import ContextUpdate
from src.llm.processor import LLMProcessor


class RAGManager:
    def __init__(
        self,
        persist_dir: str = "data/rag_index",
        collection_name: str = "phb_collection",
        embed_model_name: str = "BAAI/bge-small-en-v1.5",
    ):
        """
        Open (or create) the persistent Chroma collection and wrap it in an index.

        Args:
            persist_dir: Directory handed to ``chromadb.PersistentClient``.
            collection_name: Name of the Chroma collection to get or create.
            embed_model_name: HuggingFace model name used for embeddings.
                NOTE(review): an existing collection was presumably embedded
                with the default model; switching models on the same
                collection likely needs a re-ingest -- confirm before use.

        Side effects:
            Assigns ``Settings.embed_model``, which is a process-wide
            llama_index setting, not something scoped to this instance.
        """
        self.persist_dir = persist_dir
        self.db = chromadb.PersistentClient(path=self.persist_dir)
        self.collection_name = collection_name

        # Initialize Chroma Vector Store
        self.vector_store = ChromaVectorStore(
            chroma_collection=self.db.get_or_create_collection(self.collection_name)
        )

        # Initialize Storage Context
        self.storage_context = StorageContext.from_defaults(
            vector_store=self.vector_store
        )

        # Use a local HuggingFace embedding model to avoid API key issues during verification
        Settings.embed_model = HuggingFaceEmbedding(model_name=embed_model_name)

        # Load index if it exists, otherwise leave it unset so that retrieve()
        # can tell the caller to ingest documents first.
        try:
            self.index = VectorStoreIndex.from_vector_store(
                self.vector_store, storage_context=self.storage_context
            )
        except Exception as e:
            # Deliberately best-effort: a load failure must not prevent
            # construction, but the reason is reported instead of being lost.
            print(f"Could not load existing index from {self.persist_dir}: {e}")
            self.index = None

    def ingest_pdf(self, pdf_path: str, source_label: str = "PHB") -> None:
        """
        Parses a PDF, chunks it page by page, and stores embeddings in ChromaDB.

        Each page with extractable text becomes one ``Document`` whose metadata
        carries a human-readable ``source`` (``"<source_label> p. <n>"``) and
        the 1-based ``page`` number. Pages with no text, or only whitespace,
        are skipped so that empty chunks are never embedded.

        Args:
            pdf_path: Path of the PDF to read with ``pdfplumber``.
            source_label: Prefix used in the ``source`` metadata. Defaults to
                ``"PHB"``, the label this method always used before.

        Returns:
            None. When no page yields text, a message is printed and the
            current index is left untouched.
        """
        documents = []
        with pdfplumber.open(pdf_path) as pdf:
            for page_number, page in enumerate(pdf.pages, start=1):
                text = page.extract_text()
                # extract_text() may give None/"" (or just whitespace) for
                # image-only or blank pages; none of those are worth indexing.
                if not text or not text.strip():
                    continue

                # Page-level chunking: one document per page. A finer-grained
                # splitter could be substituted here later.
                documents.append(
                    Document(
                        text=text,
                        metadata={
                            "source": f"{source_label} p. {page_number}",
                            "page": page_number,
                        },
                    )
                )

        if not documents:
            print(f"No text extracted from {pdf_path}")
            return

        # Build the index on the shared storage context so the embeddings land
        # in the Chroma collection. NOTE(review): presumably vectors already in
        # the collection stay queryable through the new index -- confirm.
        self.index = VectorStoreIndex.from_documents(
            documents, storage_context=self.storage_context
        )
        print(f"Successfully ingested {pdf_path} into the vector store.")

    def ingest_file(self, file_path: str):
        """
        Loads a single markdown file into the index.
        """
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()

        # Use the filename as the source
        source = os.path.basename(file_path)
        doc = Document(text=text, metadata={"source": source})

        # If index doesn't exist, initialize it
        if not self.index:
            self.index = VectorStoreIndex.from_documents(
                [doc], storage_context=self.storage_context
            )
        else:
            # Insert into existing index
            self.index.insert(doc)

        print(f"Successfully ingested {file_path} into the vector store.")

    def summarize_results(self, query: str, nodes: List[Any]) -> List[ContextUpdate]:
        """
        Uses an LLM to transform raw snippets into concise "insights", filtering out irrelevant content.
        """
        if not nodes:
            return []

        processor = LLMProcessor()

        # Construct the context from retrieved nodes
        context_text = "\n\n".join(
            [
                f"Source: {node.metadata.get('source', 'Unknown')}\nContent: {node.text}"
                for node in nodes
            ]
        )

        system_prompt = (
            "You are a precise research assistant. Your task is to analyze provided text snippets "
            "and extract only the information that is directly relevant to the user's query. "
            "1. If a snippet is irrelevant to the query, discard it completely. "
            "2. For relevant information, synthesize it into a concise, single-sentence 'insight'. "
            "3. Do not simply repeat the raw text; summarize it for clarity and brevity. "
            "4. If no snippets are relevant to the query, return an empty list. "
            "5. Be factual and do not hallucinate. Use only the provided snippets."
        )

        user_prompt = (
            f"Query: {query}\n\n"
            f"Snippets:\n{context_text}\n\n"
            "Return a JSON object with a key 'insights' containing a list of objects, each with 'snippet' and 'source'."
        )

        result = processor._call_llm(
            system_prompt,
            user_prompt,
            response_format={"type": "json_object"},
        )

        import json

        try:
            data = json.loads(result)
            # Expecting a format like {"insights": [{"snippet": "...", "source": "..."}, ...]}
            insights = data.get("insights", []) if isinstance(data, dict) else data

            if not insights:
                print(f"Summarization: No relevant insights found for query: {query}")

            return [
                ContextUpdate(
                    query=query, snippet=item["snippet"], source=item["source"]
                )
                for item in insights
            ]
        except (json.JSONDecodeError, KeyError, TypeError) as e:
            print(f"Summarization parsing error: {e}")
            return []

    def retrieve(
        self, query: str, top_k: int = 5, summarize: bool = False
    ) -> List[ContextUpdate]:
        """
        Retrieves the top-K most relevant snippets for a given query.
        """
        if not self.index:
            print("Index not initialized. Please ingest documents first.")
            return []

        # Create a retriever
        retriever = self.index.as_retriever(similarity_top_k=top_k)
        nodes = retriever.retrieve(query)

        if summarize:
            return self.summarize_results(query, nodes)

        results = []
        for node in nodes:
            # Extract metadata
            source = node.metadata.get("source", "Unknown Source")

            results.append(ContextUpdate(query=query, snippet=node.text, source=source))

        return results
feat: implement RAG capabilities and Context Pane integration 2026-05-26 22:07:12 -07:00			`import os`
Implement RAG summarization and context pipeline 2026-05-27 00:17:47 -07:00			`from typing import Any, List, Optional`
feat: implement RAG capabilities and Context Pane integration 2026-05-26 22:07:12 -07:00
			`import chromadb`
			`import pdfplumber`
			`from llama_index.core import Document, Settings, StorageContext, VectorStoreIndex`
			`from llama_index.embeddings.huggingface import HuggingFaceEmbedding`
			`from llama_index.vector_stores.chroma import ChromaVectorStore`

			`from src.llm.models import ContextUpdate`
Implement RAG summarization and context pipeline 2026-05-27 00:17:47 -07:00			`from src.llm.processor import LLMProcessor`
feat: implement RAG capabilities and Context Pane integration 2026-05-26 22:07:12 -07:00

			`class RAGManager:`
			`def __init__(self, persist_dir: str = "data/rag_index"):`
			`self.persist_dir = persist_dir`
			`self.db = chromadb.PersistentClient(path=self.persist_dir)`
			`self.collection_name = "phb_collection"`

			`# Initialize Chroma Vector Store`
			`self.vector_store = ChromaVectorStore(`
			`chroma_collection=self.db.get_or_create_collection(self.collection_name)`
			`)`

			`# Initialize Storage Context`
			`self.storage_context = StorageContext.from_defaults(`
			`vector_store=self.vector_store`
			`)`

			`# Use a local HuggingFace embedding model to avoid API key issues during verification`
			`Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")`

			`# Load index if it exists, otherwise initialize`
			`try:`
			`self.index = VectorStoreIndex.from_vector_store(`
			`self.vector_store, storage_context=self.storage_context`
			`)`
			`except Exception:`
			`self.index = None`

			`def ingest_pdf(self, pdf_path: str):`
			`"""`
			`Parses a PDF, chunks it, and stores embeddings in ChromaDB.`
			`"""`
			`documents = []`
			`with pdfplumber.open(pdf_path) as pdf:`
			`for i, page in enumerate(pdf.pages):`
			`text = page.extract_text()`
			`if text:`
			`# Create a document for each page`
			`# In a real scenario, we might use a recursive character splitter`
			`# but for PHB, page-level chunking is a good start.`
			`doc = Document(`
			`text=text, metadata={"source": f"PHB p. {i + 1}", "page": i + 1}`
			`)`
			`documents.append(doc)`

			`if not documents:`
			`print(f"No text extracted from {pdf_path}")`
			`return`

			`# Create index from documents`
			`self.index = VectorStoreIndex.from_documents(`
			`documents, storage_context=self.storage_context`
			`)`
			`print(f"Successfully ingested {pdf_path} into the vector store.")`

Implement RAG summarization and context pipeline 2026-05-27 00:17:47 -07:00			`def ingest_file(self, file_path: str):`
			`"""`
			`Loads a single markdown file into the index.`
			`"""`
			`with open(file_path, "r", encoding="utf-8") as f:`
			`text = f.read()`

			`# Use the filename as the source`
			`source = os.path.basename(file_path)`
			`doc = Document(text=text, metadata={"source": source})`

			`# If index doesn't exist, initialize it`
			`if not self.index:`
			`self.index = VectorStoreIndex.from_documents(`
			`[doc], storage_context=self.storage_context`
			`)`
			`else:`
			`# Insert into existing index`
			`self.index.insert(doc)`

			`print(f"Successfully ingested {file_path} into the vector store.")`

			`def summarize_results(self, query: str, nodes: List[Any]) -> List[ContextUpdate]:`
			`"""`
			`Uses an LLM to transform raw snippets into concise "insights", filtering out irrelevant content.`
			`"""`
			`if not nodes:`
			`return []`

			`processor = LLMProcessor()`

			`# Construct the context from retrieved nodes`
			`context_text = "\n\n".join(`
			`[`
			`f"Source: {node.metadata.get('source', 'Unknown')}\nContent: {node.text}"`
			`for node in nodes`
			`]`
			`)`

			`system_prompt = (`
			`"You are a precise research assistant. Your task is to analyze provided text snippets "`
			`"and extract only the information that is directly relevant to the user's query. "`
			`"1. If a snippet is irrelevant to the query, discard it completely. "`
			`"2. For relevant information, synthesize it into a concise, single-sentence 'insight'. "`
			`"3. Do not simply repeat the raw text; summarize it for clarity and brevity. "`
			`"4. If no snippets are relevant to the query, return an empty list. "`
			`"5. Be factual and do not hallucinate. Use only the provided snippets."`
			`)`

			`user_prompt = (`
			`f"Query: {query}\n\n"`
			`f"Snippets:\n{context_text}\n\n"`
			`"Return a JSON object with a key 'insights' containing a list of objects, each with 'snippet' and 'source'."`
			`)`

			`result = processor._call_llm(`
			`system_prompt,`
			`user_prompt,`
			`response_format={"type": "json_object"},`
			`)`

			`import json`

			`try:`
			`data = json.loads(result)`
			`# Expecting a format like {"insights": [{"snippet": "...", "source": "..."}, ...]}`
			`insights = data.get("insights", []) if isinstance(data, dict) else data`

			`if not insights:`
			`print(f"Summarization: No relevant insights found for query: {query}")`

			`return [`
			`ContextUpdate(`
			`query=query, snippet=item["snippet"], source=item["source"]`
			`)`
			`for item in insights`
			`]`
			`except (json.JSONDecodeError, KeyError, TypeError) as e:`
			`print(f"Summarization parsing error: {e}")`
			`return []`

			`def retrieve(`
			`self, query: str, top_k: int = 5, summarize: bool = False`
			`) -> List[ContextUpdate]:`
feat: implement RAG capabilities and Context Pane integration 2026-05-26 22:07:12 -07:00			`"""`
			`Retrieves the top-K most relevant snippets for a given query.`
			`"""`
			`if not self.index:`
			`print("Index not initialized. Please ingest documents first.")`
			`return []`

			`# Create a retriever`
			`retriever = self.index.as_retriever(similarity_top_k=top_k)`
			`nodes = retriever.retrieve(query)`

Implement RAG summarization and context pipeline 2026-05-27 00:17:47 -07:00			`if summarize:`
			`return self.summarize_results(query, nodes)`

feat: implement RAG capabilities and Context Pane integration 2026-05-26 22:07:12 -07:00			`results = []`
			`for node in nodes:`
			`# Extract metadata`
			`source = node.metadata.get("source", "Unknown Source")`

			`results.append(ContextUpdate(query=query, snippet=node.text, source=source))`

			`return results`