From 8c89960299af7dd6635111d5beca2b6561e54bba Mon Sep 17 00:00:00 2001
From: Ben Xu <benx.xu@mail.utoronto.ca>
Date: Wed, 1 Jan 2025 04:19:03 -0500
Subject: [PATCH] revert anticipation to default

---
 .../source/server/livekit/anticipation.py     | 56 +++++++++++--------
 1 file changed, 32 insertions(+), 24 deletions(-)

diff --git a/software/source/server/livekit/anticipation.py b/software/source/server/livekit/anticipation.py
index e41e008..cd779e6 100644
--- a/software/source/server/livekit/anticipation.py
+++ b/software/source/server/livekit/anticipation.py
@@ -3,7 +3,7 @@ import json
 import base64
 import traceback
 import io
-import os
+import re
 from PIL import Image as PIL_Image
 
 from openai import OpenAI
@@ -20,15 +20,18 @@ INSTRUCTIONS_PROMPT = """Given the conversation context and the current video fr
 Rate the severity of violation from 0-10, where 10 is most severe.
 
 Instructions to check:
-1. Ensure that the screenshot is NOT YOUTUBE or other video content
-
-Respond in the following JSON format:
-{
-    "violation_detected": boolean,
-    "severity_rating": number,
-    "violation_summary": string,
-    "recommendations": string
-}
+1. Ensure that there is no one in the frame.
+
+"""
+
+RESPONSE_FORMAT = """
+    Respond in the following JSON format:
+    {
+        "violation_detected": boolean,
+        "severity_rating": number,
+        "violation_summary": string,
+        "recommendations": string
+    }
 """
 
 
@@ -53,7 +56,7 @@ async def handle_instruction_check(
             log_message(f"Violation detected with severity {result['severity_rating']}, triggering assistant response")
             
             # Append violation to chat context
-            violation_text = f"For the given instructions: {INSTRUCTIONS_PROMPT}\n. Instruction violation frame detected: {result['violation_summary']}\nRecommendations: {result['recommendations']}"
+            violation_text = f"Instruction violation frame detected: {result['violation_summary']}\nRecommendations: {result['recommendations']}"
             assistant.chat_ctx.append(
                 role="user",
                 text=violation_text
@@ -75,12 +78,16 @@ async def handle_instruction_check(
 
             # TODO: instead of saying the predetermined response, we'll trigger an assistant response here
             # we can append the current video frame that triggered the violation to the chat context
-            stream = assistant.llm.chat(
-                chat_ctx=assistant.chat_ctx,
-                fnc_ctx=assistant.fnc_ctx,
-            )
+            # NOTE: the assistant.llm.chat call below currently fails with an unexpected connection error:
+            # httpcore.ConnectError: All connection attempts failed
+
+            # stream = assistant.llm.chat(
+            #     chat_ctx=assistant.chat_ctx,
+            #     fnc_ctx=assistant.fnc_ctx,
+            # )
 
-            await assistant.say(stream)
+            # We temporarily default back to saying the predetermined response.
+            await assistant.say(violation_text)
         else:
             log_message("No significant violations detected or severity below threshold")
     except Exception as e:
@@ -93,15 +100,11 @@ async def check_instruction_violation(
     chat_ctx: ChatContext,
     video_frame: rtc.VideoFrame,
 ) -> Dict[str, Any]:
-    """Make a call to GPT-4 Vision to check for instruction violations"""
+    """Makes a call to gpt-4o-mini to check for instruction violations"""
     log_message("Creating new context for instruction check...")
     
     try:
-        # pull this from env. 
-        interpreter_server_host = os.getenv('INTERPRETER_SERVER_HOST', 'localhost')
-        interpreter_server_port = os.getenv('INTERPRETER_SERVER_PORT', '8000')
-        base_url = f"http://{interpreter_server_host}:{interpreter_server_port}/"
-        client = OpenAI(base_url)
+        client = OpenAI()
         
         try:
             # Get raw RGBA data
@@ -135,7 +138,7 @@ async def check_instruction_violation(
                     {
                         "role": "user", 
                         "content": [
-                            {"type": "text", "text": INSTRUCTIONS_PROMPT},
+                            {"type": "text", "text": INSTRUCTIONS_PROMPT + RESPONSE_FORMAT},
                             {
                                 "type": "image_url",
                                 "image_url": {
@@ -154,7 +157,12 @@ async def check_instruction_violation(
         
         try:
             # Parse the response content
-            result = json.loads(response.choices[0].message.content)
+            # Clean up the LLM response if it includes ```json ... ```
+            content = response.choices[0].message.content.strip()
+            content = re.sub(r'^```(?:json)?', '', content)  # remove leading triple backticks and optional 'json'
+            content = re.sub(r'```$', '', content).strip()   # remove trailing triple backticks
+            result = json.loads(content)
+            
             log_message(f"Successfully parsed LLM response: {json.dumps(result, indent=2)}")
             return result
         except Exception as e: