# ItemSense/main.py
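"""ItemSense: a minimal PyObjC macOS app.

Shows a live webcam preview; clicking "Capture" freezes the feed and sends
the current frame to the OpenAI vision API, printing a short item
description to the console.

Assumed dependencies (PyPI names): opencv-python, pyobjc, openai,
python-dotenv.
"""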
import base64
import os
import threading

import cv2
from dotenv import load_dotenv
from openai import OpenAI

from AppKit import (
    NSApplication, NSApp, NSWindow, NSImageView, NSButton,
    NSStackView, NSImage, NSBackingStoreBuffered,
    NSWindowStyleMaskTitled, NSWindowStyleMaskClosable,
    NSWindowStyleMaskResizable, NSWindowStyleMaskMiniaturizable,
    NSMakeRect, NSLog, NSImageScaleProportionallyDown,
    NSStackViewGravityTop, NSStackViewGravityBottom,
    NSUserInterfaceLayoutOrientationVertical,
)
from Foundation import NSData, NSObject, NSTimer
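
# Expected .env contents (hypothetical example; substitute your real key):
#   OPENAI_API_KEY=sk-...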
load_dotenv()  # Load OPENAI_API_KEY from the environment / a local .env file


class ItemSenseApp(NSObject):
    def applicationDidFinishLaunching_(self, notification):
        self.window = NSWindow.alloc().initWithContentRect_styleMask_backing_defer_(
            NSMakeRect(0, 0, 800, 600),
            NSWindowStyleMaskTitled | NSWindowStyleMaskClosable
            | NSWindowStyleMaskResizable | NSWindowStyleMaskMiniaturizable,
            NSBackingStoreBuffered,
            False,
        )
        self.window.setTitle_("ItemSense")
        self.window.center()

        # Main content view (NSStackView handles the vertical layout)
        self.stack_view = NSStackView.alloc().init()
        self.stack_view.setOrientation_(NSUserInterfaceLayoutOrientationVertical)
        self.stack_view.setSpacing_(10)
        self.stack_view.setEdgeInsets_((10, 10, 10, 10))  # (top, left, bottom, right)
        self.window.setContentView_(self.stack_view)

        # Image view for the camera feed
        self.image_view = NSImageView.alloc().init()
        self.image_view.setImageScaling_(NSImageScaleProportionallyDown)
        self.stack_view.addView_inGravity_(self.image_view, NSStackViewGravityTop)

        # Capture button
        self.capture_button = NSButton.buttonWithTitle_target_action_(
            "Capture", self, "captureClicked:"
        )
        self.stack_view.addView_inGravity_(self.capture_button, NSStackViewGravityBottom)

        self.window.makeKeyAndOrderFront_(None)

        # Initialize the camera (device 0 = default webcam)
        self.cap = cv2.VideoCapture(0)
        if not self.cap.isOpened():
            NSLog("Error: Could not open camera")

        # State
        self.is_capturing = True
        self.current_frame = None

        # Poll the camera at roughly 30 FPS on the main run loop
        self.timer = NSTimer.scheduledTimerWithTimeInterval_target_selector_userInfo_repeats_(
            1.0 / 30.0, self, "updateFrame:", None, True
        )
    def applicationShouldTerminateAfterLastWindowClosed_(self, sender):
        return True

    def applicationWillTerminate_(self, notification):
        if hasattr(self, 'cap') and self.cap.isOpened():
            self.cap.release()
    def updateFrame_(self, timer):
        if not self.is_capturing:
            return
        if hasattr(self, 'cap') and self.cap.isOpened():
            ret, frame = self.cap.read()
            if ret:
                self.current_frame = frame  # Store the raw BGR frame for capture
                # Convert BGR (OpenCV's layout) to RGB for display
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                height, width, _ = rgb_frame.shape
                # Wrap the raw pixels in a binary PPM (P6) header; PPM is
                # uncompressed, so encoding is fast enough to do every frame
                header = f"P6 {width} {height} 255 ".encode()
                data = header + rgb_frame.tobytes()
                ns_data = NSData.dataWithBytes_length_(data, len(data))
                ns_image = NSImage.alloc().initWithData_(ns_data)
                self.image_view.setImage_(ns_image)
    def captureClicked_(self, sender):
        if self.is_capturing:
            print("Capture clicked - Processing...")
            self.is_capturing = False
            self.capture_button.setTitle_("Processing...")
            self.capture_button.setEnabled_(False)
            # Run the OpenAI request off the main thread so the UI stays
            # responsive; daemon=True so a hung request cannot block app exit
            threading.Thread(target=self.processImage, daemon=True).start()
    def processImage(self):
        try:
            if self.current_frame is None:
                self.performSelectorOnMainThread_withObject_waitUntilDone_(
                    "handleError:", "No frame captured", False
                )
                return
            # Encode the captured BGR frame as JPEG, then base64 for the data URL
            _, buffer = cv2.imencode('.jpg', self.current_frame)
            base64_image = base64.b64encode(buffer).decode('utf-8')
            client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
            response = client.chat.completions.create(
                model="gpt-4o-mini",  # Fallback to 4o-mini as 5-mini is hypothetical/beta
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": "What is this item? Please provide a brief description.",
                            },
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{base64_image}"
                                },
                            },
                        ],
                    }
                ],
                max_tokens=300,
            )
            result_text = response.choices[0].message.content
            # Hop back to the main thread for all UI updates
            self.performSelectorOnMainThread_withObject_waitUntilDone_(
                "handleResponse:", result_text, False
            )
        except Exception as e:
            self.performSelectorOnMainThread_withObject_waitUntilDone_(
                "handleError:", str(e), False
            )
    def handleResponse_(self, result):
        print(f"OpenAI Response: {result}")
        # For this spec, printing the response is enough; a proper result UI
        # and reset flow arrive with Spec 003.
        self.capture_button.setTitle_("Captured (Check Console)")

    def handleError_(self, error_msg):
        print(f"Error: {error_msg}")
        # Re-enable the button and resume the live preview so the user can retry
        self.capture_button.setTitle_("Error - Try Again")
        self.capture_button.setEnabled_(True)
        self.is_capturing = True


if __name__ == "__main__":
    app = NSApplication.sharedApplication()
    delegate = ItemSenseApp.alloc().init()
    app.setDelegate_(delegate)
    NSApp.activateIgnoringOtherApps_(True)
    app.run()