diff --git a/.env b/.env index 2a76d88..ee04edb 100644 --- a/.env +++ b/.env @@ -2,5 +2,7 @@ OPENAI_API_KEY=no-key-required OPENAI_BASE_URL=https://vllm.tipsy.codes/v1 LLM_MODEL=Intel/gemma-4-31B-it-int4-AutoRound +#LLM_BACKEND=ollama +#LLM_MODEL=gemma:2b WHISPER_MODEL=base AUDIO_DEVICE_ID=None diff --git a/src/llm/__pycache__/processor.cpython-314.pyc b/src/llm/__pycache__/processor.cpython-314.pyc index ee70c98..e142040 100644 Binary files a/src/llm/__pycache__/processor.cpython-314.pyc and b/src/llm/__pycache__/processor.cpython-314.pyc differ diff --git a/src/llm/__pycache__/prompts.cpython-314.pyc b/src/llm/__pycache__/prompts.cpython-314.pyc index b8c3acf..8d7af04 100644 Binary files a/src/llm/__pycache__/prompts.cpython-314.pyc and b/src/llm/__pycache__/prompts.cpython-314.pyc differ diff --git a/src/llm/processor.py b/src/llm/processor.py index df69d56..ee241d2 100644 --- a/src/llm/processor.py +++ b/src/llm/processor.py @@ -22,10 +22,34 @@ class LLMProcessor: :param base_url: OpenAI-compatible base URL (e.g., for vLLM). :param model: The model to use for processing. If None, it looks for LLM_MODEL in environment variables. """ - self.client = OpenAI( - api_key=api_key or os.environ.get("OPENAI_API_KEY"), - base_url=base_url or os.environ.get("OPENAI_BASE_URL"), - ) + backend = os.environ.get("LLM_BACKEND", "openai").lower() + + if backend == "ollama": + # Ollama's OpenAI-compatible API + final_base_url = base_url or "http://localhost:11434/v1" + final_api_key = api_key or "ollama" + elif backend == "vllm": + # Remote vLLM server + final_base_url = base_url or os.environ.get("OPENAI_BASE_URL") + final_api_key = api_key or os.environ.get("OPENAI_API_KEY") + else: # default to openai + final_base_url = base_url or os.environ.get("OPENAI_BASE_URL") + final_api_key = api_key or os.environ.get("OPENAI_API_KEY") + + try: + self.client = OpenAI( + api_key=final_api_key, + base_url=final_base_url, + ) + # Simple connectivity check for local backends + if backend == "ollama": + # We can't easily check connectivity without making a call, + # but we can ensure the client is initialized. + pass + except Exception as e: + print(f"Error initializing LLM client for backend {backend}: {e}") + raise + self.model = model or os.environ.get("LLM_MODEL", "gpt-4o") def _call_llm( @@ -67,6 +91,7 @@ class LLMProcessor: print(f"LLM Processor (Extract): Calling extraction for: {filtered_text}") try: # Using standard chat.completions.create with JSON mode for better compatibility with vLLM + print("LLM Processor (Extract): Sending request to backend...") response = self.client.chat.completions.create( model=self.model, messages=[ @@ -76,6 +101,7 @@ class LLMProcessor: response_format={"type": "json_object"}, extra_body={"include_reasoning": False}, ) + print("LLM Processor (Extract): Response received from backend.") import json diff --git a/src/llm/prompts.py b/src/llm/prompts.py index 42f9bf4..957beba 100644 --- a/src/llm/prompts.py +++ b/src/llm/prompts.py @@ -11,10 +11,51 @@ EXTRACTION_SYSTEM_PROMPT = """ You are a D&D session analyzer. Your goal is to extract structured data from a filtered transcript. Extract any changes to character states (HP, status effects, inventory) and any new lore facts (NPCs, locations, world-building). -Guidelines: -1. Lore: Identify any new information about the world, people, and places. -2. Character State: Look for mentions of damage, healing, or items being gained or lost. -3. Events: Note significant plot developments. +DO NOT THINK. -Be precise. If no relevant information is found, return empty lists. +CONSTRAINTS: +- OUTPUT ONLY VALID JSON. +- DO NOT include any commentary, explanations, or "thought" blocks. +- DO NOT include any keys other than "lore", "character_state", and "events". +- If no relevant information is found, return empty lists for all keys. +- If a character name is not specified (e.g., "Your character"), use "Player Character". + +Strict Output Format: +Return a JSON object with exactly these keys: +1. "lore": A list of objects. Each object MUST have: + - "category": (string) 'NPC', 'Location', 'WorldBuilding', or 'Plot' + - "entity_name": (string) The name of the NPC, Location, or entity + - "content": (string) The actual lore fact or description +2. "character_state": A list of objects. Each object MUST have: + - "character_name": (string) Name of the character + - "hp_change": (integer, optional) Change in HP + - "status_effects_added": (list of strings) + - "status_effects_removed": (list of strings) + - "inventory_changes": (list of objects with "item", "quantity", "action") +3. "events": A list of strings. Each string should be a concise description of a significant plot development. + +Example Output: +{ + "lore": [ + { + "category": "NPC", + "entity_name": "Thorne", + "content": "A gruff dwarf who runs the local tavern." + } + ], + "character_state": [ + { + "character_name": "Grog", + "hp_change": -10, + "status_effects_added": [], + "status_effects_removed": [], + "inventory_changes": [] + } + ], + "events": [ + "The party discovered the secret entrance to the crypt." + ] +} + +Be precise. Return only the JSON object. """ diff --git a/src/pipeline/__pycache__/orchestrator.cpython-314.pyc b/src/pipeline/__pycache__/orchestrator.cpython-314.pyc index 94c2e6c..377ff5d 100644 Binary files a/src/pipeline/__pycache__/orchestrator.cpython-314.pyc and b/src/pipeline/__pycache__/orchestrator.cpython-314.pyc differ diff --git a/src/pipeline/orchestrator.py b/src/pipeline/orchestrator.py index b5f54a2..aea3f2a 100644 --- a/src/pipeline/orchestrator.py +++ b/src/pipeline/orchestrator.py @@ -88,39 +88,13 @@ class PipelineOrchestrator: Worker that handles TUI: Proposal -> Persistence. """ logger.info("TUI Worker started.") - while self.is_running: - try: - # Get proposal from queue - result = await self.proposal_queue.get() - - logger.info("Proposal received. Launching TUI for confirmation.") - - # Launch TUI (Note: Textual's run() is blocking) - # We need to run the TUI in a way that doesn't block the overall event loop - # or we accept that the system pauses for confirmation. - # Given the requirement for "Non-blocking", but TUI is a focus-modal, - # we launch it. - - # To integrate Textual with asyncio, we can use its async support. - # However, ConfirmationApp is designed as a standard Textual app. - # Since we want to bridge the asyncio loop, we'll run the TUI. - - # Note: In a real high-performance pipeline, we'd use an async TUI - # that updates widgets in real-time. For now, we follow the - # a confirmation screen pattern. - - # we will use the run() method, but since we are in an async loop, - # we might need to wrap it or use an async variant. - # For this integration, we'll use the run() method as defined - # in ConfirmationApp, which will take over the terminal. - - ConfirmationApp(result).run() - - except Exception as e: - logger.error(f"TUI Worker error: {e}") - - # Small sleep - await asyncio.sleep(0.1) + try: + # Launch TUI exactly once. + # Pass the proposal queue to the app. + app = ConfirmationApp(proposal_queue=self.proposal_queue) + await app.run_async() + except Exception as e: + logger.error(f"TUI Worker error: {e}") async def run(self): """ diff --git a/src/stt/__pycache__/listener.cpython-314.pyc b/src/stt/__pycache__/listener.cpython-314.pyc index 006facc..393e9fd 100644 Binary files a/src/stt/__pycache__/listener.cpython-314.pyc and b/src/stt/__pycache__/listener.cpython-314.pyc differ diff --git a/src/stt/listener.py b/src/stt/listener.py index 16665ed..73defdc 100644 --- a/src/stt/listener.py +++ b/src/stt/listener.py @@ -3,6 +3,7 @@ import logging import numpy as np import sounddevice as sd +import torch logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -11,16 +12,56 @@ logger = logging.getLogger(__name__) class AudioListener: """ Captures audio from the microphone in chunks and puts them into an asyncio queue. + Uses Silero VAD for dynamic chunking based on speech detection. """ - def __init__(self, sample_rate=16000, chunk_duration=3, device=None, loop=None): + def __init__( + self, + sample_rate=16000, + device=None, + loop=None, + vad_threshold=0.5, + silence_duration=0.5, + max_chunk_size=30, + ): self.sample_rate = sample_rate - self.chunk_duration = chunk_duration self.device = device self.loop = loop + + # VAD Configuration + self.vad_threshold = vad_threshold + self.silence_duration = silence_duration + self.max_chunk_size = max_chunk_size + self.audio_queue = asyncio.Queue() self.is_listening = False + # Load Silero VAD model + try: + self.model, utils = torch.hub.load( + repo_or_dir="snakers4/silero-vad", model="silero_vad" + ) + self.model.eval() + if torch.cuda.is_available(): + self.model = self.model.cuda() + logger.info("Silero VAD model loaded successfully.") + except Exception as e: + logger.error(f"Failed to load Silero VAD model: {e}") + raise + + # VAD state + self.is_collecting = False + self.silence_samples = 0 + self.max_silence_samples = int(self.silence_duration * self.sample_rate) + self.max_chunk_samples = int(self.max_chunk_size * self.sample_rate) + + # Pre-padding buffer (e.g., 200ms) + self.pre_padding_samples = int(0.2 * self.sample_rate) + self._ring_buffer = np.zeros(self.pre_padding_samples, dtype=np.float32) + self._ring_buffer_idx = 0 + + self._collection_buffer = [] + def _audio_callback(self, indata, frames, time, status): """ This callback is called by sounddevice for every block of audio captured. @@ -28,25 +69,77 @@ class AudioListener: if status: logger.warning(f"SoundDevice status: {status}") - # We capture audio in chunks. sounddevice provides blocks. - # We append these blocks to a buffer until we reach chunk_duration. - self._buffer.append(indata.copy()) + # Ensure data is float32 and 1D + audio_data = indata.flatten().astype(np.float32) - # Check if we have enough data for a full chunk - current_duration = len(self._buffer) * frames / self.sample_rate - if current_duration >= self.chunk_duration: - # Concatenate all buffers into one chunk - chunk = np.concatenate(self._buffer, axis=0) - # Trim to exactly chunk_duration to maintain consistency - target_samples = int(self.sample_rate * self.chunk_duration) - chunk = chunk[:target_samples] + # 1. Update ring buffer for pre-padding + # We overwrite the oldest data in the ring buffer + num_samples = len(audio_data) + for i in range(num_samples): + self._ring_buffer[self._ring_buffer_idx] = audio_data[i] + self._ring_buffer_idx = ( + self._ring_buffer_idx + 1 + ) % self.pre_padding_samples - # Flatten to 1D array (samples,) as expected by faster-whisper - chunk = chunk.flatten() + # 2. Run VAD + # Convert to torch tensor + tensor_input = torch.from_numpy(audio_data) + if torch.cuda.is_available(): + tensor_input = tensor_input.cuda() - # Use call_soon_threadsafe to put the chunk into the asyncio queue from the callback thread - self.loop.call_soon_threadsafe(self.audio_queue.put_nowait, chunk) - self._buffer = [] + with torch.no_grad(): + # The model expects (batch, samples) + # Silero VAD expects frames of 512, 1024, or 1536 for 16kHz + # Since we use block_size=512, we are good. + probability = self.model(tensor_input.unsqueeze(0), self.sample_rate).item() + + # 3. State-based Chunking Logic + if probability > self.vad_threshold: + if not self.is_collecting: + # Start Detection: Transition to COLLECTING + logger.debug("Speech detected. Starting collection.") + self.is_collecting = True + + # Pre-padding: Append the ring buffer in the correct order + padding = np.roll(self._ring_buffer, -self._ring_buffer_idx) + self._collection_buffer.append(padding) + + # Reset silence counter + self.silence_samples = 0 + self._collection_buffer.append(audio_data) + + elif self.is_collecting: + # We are in COLLECTING state but current frame is silence + self._collection_buffer.append(audio_data) + self.silence_samples += num_samples + + # End Detection: Silence lasted longer than threshold + if self.silence_samples >= self.max_silence_samples: + logger.debug("Silence detected. Flushing chunk.") + self._flush_buffer() + # Max Chunk Size: Force flush + elif sum(len(b) for b in self._collection_buffer) >= self.max_chunk_samples: + logger.debug("Max chunk size reached. Force flushing.") + self._flush_buffer() + + else: + # IDLE state, just waiting for speech + pass + + def _flush_buffer(self): + """ + Concatenates the collection buffer and puts it into the asyncio queue. + """ + if not self._collection_buffer: + return + + chunk = np.concatenate(self._collection_buffer).flatten() + self.loop.call_soon_threadsafe(self.audio_queue.put_nowait, chunk) + + # Reset state + self._collection_buffer = [] + self.is_collecting = False + self.silence_samples = 0 def start(self): """ @@ -56,11 +149,11 @@ class AudioListener: raise RuntimeError("Event loop must be provided to AudioListener") self.is_listening = True - self._buffer = [] + self._collection_buffer = [] - # Define the block size for the callback - # We'll use a smaller block size (e.g. 0.1s) to keep the callback responsive - block_size = int(self.sample_rate * 0.1) + # Define the block size for the callback. + # Silero VAD v4 recommends 512 samples for 16kHz. + block_size = 512 try: self.stream = sd.InputStream( @@ -71,7 +164,7 @@ class AudioListener: callback=self._audio_callback, ) self.stream.start() - logger.info("Audio listener started.") + logger.info("Audio listener started with VAD-based chunking.") except Exception as e: logger.error(f"Failed to start audio listener: {e}") self.is_listening = False diff --git a/src/ui/__pycache__/tui.cpython-314.pyc b/src/ui/__pycache__/tui.cpython-314.pyc index 057531b..1064f1c 100644 Binary files a/src/ui/__pycache__/tui.cpython-314.pyc and b/src/ui/__pycache__/tui.cpython-314.pyc differ diff --git a/src/ui/tui.py b/src/ui/tui.py index 889d8ea..8e914bb 100644 --- a/src/ui/tui.py +++ b/src/ui/tui.py @@ -1,4 +1,5 @@ -from typing import List, Union +import asyncio +from typing import List, Optional, Union from textual.app import App, ComposeResult from textual.containers import Container, Horizontal, Vertical @@ -17,13 +18,13 @@ class ConfirmationApp(App): #left-pane { width: 40%; - border: solid 1; + border: solid; padding: 1; } #right-pane { width: 60%; - border: solid 1; + border: solid; padding: 1; layout: vertical; } @@ -43,7 +44,7 @@ class ConfirmationApp(App): display: none; height: auto; layout: vertical; - border: solid 1; + border: solid; padding: 1; } @@ -56,14 +57,20 @@ class ConfirmationApp(App): ("q", "quit", "Quit"), ] - def __init__(self, result: ExtractionResult): + def __init__( + self, + result: Optional[ExtractionResult] = None, + proposal_queue: Optional[asyncio.Queue] = None, + ): super().__init__() self.result = result + self.proposal_queue = proposal_queue self.pending_updates: List[Union[LoreUpdate, CharacterStateUpdate]] = [] - # Populate pending updates from result - self.pending_updates.extend(result.lore_updates) - self.pending_updates.extend(result.character_updates) + if result: + # Populate pending updates from result + self.pending_updates.extend(result.lore_updates) + self.pending_updates.extend(result.character_updates) self.selected_index = -1 @@ -100,6 +107,7 @@ class ConfirmationApp(App): def on_mount(self) -> None: table = self.query_one("#update-table", DataTable) + table.cursor_type = "row" table.add_columns("Type", "Target", "Update") for i, update in enumerate(self.pending_updates): @@ -117,8 +125,72 @@ class ConfirmationApp(App): ) table.add_row("Char", update.character_name, change_text, key=str(i)) - def on_data_table_row_selected(self, event: DataTable.RowSelected) -> None: - self.selected_index = event.row + if self.pending_updates: + self.handle_row_highlight(0) + self.query_one("#btn-accept", Button).focus() + + if self.proposal_queue: + self.run_worker(self.poll_proposal_queue, thread=False) + + async def poll_proposal_queue(self) -> None: + """ + Background worker that polls the proposal queue for new extraction results. + """ + while True: + try: + result = await self.proposal_queue.get() + self.add_result(result) + except Exception as e: + # Log error but keep the worker running + self.log(f"Error polling proposal queue: {e}") + finally: + # Signal that the item has been processed + if hasattr(self.proposal_queue, "task_done"): + self.proposal_queue.task_done() + + def add_result(self, result: ExtractionResult) -> None: + """ + Adds results from the LLM processor to the TUI table. + """ + table = self.query_one("#update-table", DataTable) + start_index = len(self.pending_updates) + + for update in result.lore_updates + result.character_updates: + self.pending_updates.append(update) + actual_index = len(self.pending_updates) - 1 + + if isinstance(update, LoreUpdate): + table.add_row( + "Lore", + update.entity_name or "General", + update.content, + key=str(actual_index), + ) + elif isinstance(update, CharacterStateUpdate): + change_text = f"HP: {update.hp_change or 0}" + if update.status_effects_added: + change_text += f", Added: {', '.join(update.status_effects_added)}" + if update.status_effects_removed: + change_text += ( + f", Removed: {', '.join(update.status_effects_removed)}" + ) + table.add_row( + "Char", update.character_name, change_text, key=str(actual_index) + ) + + # If the table was previously empty and we added updates, focus the first one. + if start_index == 0 and self.pending_updates: + self.handle_row_highlight(0) + self.query_one("#btn-accept", Button).focus() + + def on_data_table_row_highlighted(self, event: DataTable.RowHighlighted) -> None: + self.handle_row_highlight(event.cursor_row) + + def handle_row_highlight(self, row: int) -> None: + self.selected_index = row + if self.selected_index < 0 or self.selected_index >= len(self.pending_updates): + return + update = self.pending_updates[self.selected_index] details_text = self.query_one("#details-text", Static) @@ -135,7 +207,7 @@ class ConfirmationApp(App): self.query_one("#edit-container", Vertical).styles.display = "none" self.query_one("#details-container", Vertical).styles.display = "block" - def on_button_clicked(self, event: Button.Clicked) -> None: + def on_button_pressed(self, event: Button.Pressed) -> None: if self.selected_index == -1: return @@ -237,5 +309,9 @@ class ConfirmationApp(App): ) table.add_row("Char", update.character_name, change_text, key=str(i)) - self.selected_index = -1 - self.query_one("#details-text", Static).update("No update selected") + if self.pending_updates: + self.handle_row_highlight(0) + self.query_one("#btn-accept", Button).focus() + else: + self.selected_index = -1 + self.query_one("#details-text", Static).update("All updates processed.")