# Repository viewer header (not part of the program):
#   Files: ItemSense/python/main.py
#   2026-01-21 15:41:18 -05:00
#   288 lines, 12 KiB, Python

import sys
import cv2
import numpy as np
from PIL import Image
import objc
import threading
import base64
import os
import json
from dotenv import load_dotenv
from openai import OpenAI
from AppKit import (
NSApplication, NSApp, NSWindow, NSView, NSImageView, NSButton,
NSStackView, NSImage, NSBitmapImageRep, NSBackingStoreBuffered,
NSWindowStyleMaskTitled, NSWindowStyleMaskClosable,
NSWindowStyleMaskResizable, NSWindowStyleMaskMiniaturizable,
NSTimer, NSMakeSize, NSMakeRect, NSObject, NSLog,
NSUserInterfaceLayoutOrientationVertical, NSSplitView,
NSLayoutAttributeCenterX, NSLayoutAttributeCenterY,
NSLayoutAttributeWidth, NSLayoutAttributeHeight,
NSLayoutAttributeTop, NSLayoutAttributeBottom, NSLayoutAttributeLeading,
NSLayoutAttributeTrailing, NSScrollView, NSTextView,
NSApplicationActivationPolicyRegular, NSFont
)
from WebKit import WKWebView, WKWebViewConfiguration
from Foundation import NSObject, NSTimer, NSDate, NSURL, NSURLRequest
load_dotenv()
from PyObjCTools import AppHelper
class ItemSenseApp(NSObject):
    """Application delegate that builds and drives the ItemSense window.

    The window is a two-pane split view. The left pane stacks the live camera
    preview, a read-only text area and a single action button; the right pane
    is a web view. Pressing the button freezes the preview, sends the last
    captured frame to the OpenAI chat-completions API on a background thread,
    shows the returned description and loads an Amazon search for the
    returned keywords in the web view.
    """

    def applicationDidFinishLaunching_(self, notification):
        """Build the UI, show the window and schedule camera start-up.

        Any exception raised while building the UI is printed (with a
        traceback) rather than propagated.
        """
        # Initialise state before anything else so that the button and timer
        # callbacks always find these attributes, even if UI construction
        # below fails part-way through.
        self.is_capturing = True
        self.current_frame = None

        try:
            print("Application did finish launching...")

            # 1200px wide so both split-view panes fit side by side.
            self.window = NSWindow.alloc().initWithContentRect_styleMask_backing_defer_(
                NSMakeRect(0, 0, 1200, 600),
                NSWindowStyleMaskTitled
                | NSWindowStyleMaskClosable
                | NSWindowStyleMaskResizable
                | NSWindowStyleMaskMiniaturizable,
                NSBackingStoreBuffered,
                False,
            )
            self.window.setTitle_("ItemSense")
            self.window.center()

            # Main split view; setVertical_(True) places the panes side by side.
            self.split_view = NSSplitView.alloc().initWithFrame_(
                self.window.contentView().bounds()
            )
            self.split_view.setVertical_(True)
            # NOTE(review): in AppKit's NSSplitViewDividerStyle enum the value 1
            # is "thick" and 2 is "thin"; the original comment here said "thin".
            # Value left unchanged - confirm which appearance is intended.
            self.split_view.setDividerStyle_(1)
            self.window.setContentView_(self.split_view)

            # Left pane: camera preview + description + button, stacked vertically.
            self.left_pane = NSStackView.alloc().init()
            self.left_pane.setOrientation_(NSUserInterfaceLayoutOrientationVertical)
            self.left_pane.setSpacing_(10)
            self.left_pane.setEdgeInsets_((10, 10, 10, 10))
            # Minimum width so the left pane cannot be collapsed away.
            self.left_pane.setTranslatesAutoresizingMaskIntoConstraints_(False)
            self.left_pane.widthAnchor().constraintGreaterThanOrEqualToConstant_(
                400.0
            ).setActive_(True)
            self.split_view.addArrangedSubview_(self.left_pane)

            # Image view for the camera feed.
            self.image_view = NSImageView.alloc().init()
            self.image_view.setImageScaling_(0)  # NSImageScaleProportionallyDown
            self.left_pane.addView_inGravity_(self.image_view, 1)  # Top gravity

            # Scrollable, read-only text area for status and results.
            self.scroll_view = NSScrollView.alloc().init()
            self.scroll_view.setHasVerticalScroller_(True)
            self.scroll_view.setBorderType_(2)  # NSBezelBorder

            content_size = self.scroll_view.contentSize()
            self.text_view = NSTextView.alloc().initWithFrame_(
                NSMakeRect(0, 0, content_size.width, content_size.height)
            )
            self.text_view.setMinSize_(NSMakeSize(0.0, content_size.height))
            self.text_view.setMaxSize_(NSMakeSize(float("inf"), float("inf")))
            # Grow vertically with the text, wrap horizontally to the view width.
            self.text_view.setVerticallyResizable_(True)
            self.text_view.setHorizontallyResizable_(False)
            self.text_view.setAutoresizingMask_(18)  # NSViewWidthSizable | NSViewHeightSizable
            self.text_view.textContainer().setContainerSize_(
                NSMakeSize(content_size.width, float("inf"))
            )
            self.text_view.textContainer().setWidthTracksTextView_(True)
            self.text_view.setEditable_(False)
            self.text_view.setRichText_(False)
            self.text_view.setFont_(NSFont.systemFontOfSize_(18.0))

            self.scroll_view.setDocumentView_(self.text_view)
            self.left_pane.addView_inGravity_(self.scroll_view, 2)  # Center gravity

            # Give the scroll view a minimum height and tie its width to the
            # pane width minus the 10pt insets on each side.
            self.scroll_view.setTranslatesAutoresizingMaskIntoConstraints_(False)
            self.scroll_view.heightAnchor().constraintGreaterThanOrEqualToConstant_(
                150.0
            ).setActive_(True)
            self.scroll_view.widthAnchor().constraintEqualToAnchor_constant_(
                self.left_pane.widthAnchor(), -20.0
            ).setActive_(True)
            self.text_view.setString_("Initializing camera...")

            # Action button; its title/action are swapped as the scan progresses.
            self.capture_button = NSButton.buttonWithTitle_target_action_(
                "Capture", self, "captureClicked:"
            )
            self.left_pane.addView_inGravity_(self.capture_button, 3)  # Bottom gravity

            # Right pane: web view used for the product search results.
            config = WKWebViewConfiguration.alloc().init()
            self.web_view = WKWebView.alloc().initWithFrame_configuration_(
                NSMakeRect(0, 0, 500, 600), config
            )
            self.split_view.addArrangedSubview_(self.web_view)

            self.window.makeKeyAndOrderFront_(None)
            self.window.orderFrontRegardless()

            # Holding priorities: the left pane (251) resists resizing more
            # than the right pane (249), then place the divider at 660pt.
            self.split_view.setHoldingPriority_forSubviewAtIndex_(251.0, 0)
            self.split_view.setHoldingPriority_forSubviewAtIndex_(249.0, 1)
            self.split_view.setPosition_ofDividerAtIndex_(660.0, 0)
            print("Window ordered front.")

            # Open the camera after a short delay so the UI can render first.
            self.performSelector_withObject_afterDelay_("initCamera:", None, 0.5)
        except Exception as e:
            import traceback

            traceback.print_exc()
            print(f"Error in applicationDidFinishLaunching: {e}")

    def initCamera_(self, sender):
        """Open camera 0 and start the repeating preview timer.

        If the camera cannot be opened, the error is logged and shown in the
        text view, the capture handle is released and no timer is started.
        """
        print("Initializing camera...")
        self.cap = cv2.VideoCapture(0)
        self.cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
        self.cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

        if not self.cap.isOpened():
            NSLog("Error: Could not open camera")
            self.text_view.setString_("Error: Could not open camera.")
            # Do not hold on to a capture handle that failed to open.
            self.cap.release()
            return

        print("Camera opened.")
        self.text_view.setString_("Ready to capture")

        # Repeating timer at 1/30 s drives the preview (about 30 FPS).
        self.timer = NSTimer.scheduledTimerWithTimeInterval_target_selector_userInfo_repeats_(
            1.0 / 30.0, self, "updateFrame:", None, True
        )

    def applicationShouldTerminateAfterLastWindowClosed_(self, sender):
        """Return True so closing the window terminates the application."""
        return True

    def applicationWillTerminate_(self, notification):
        """Stop the preview timer and release the camera, if they exist."""
        timer = getattr(self, "timer", None)
        if timer is not None:
            timer.invalidate()

        cap = getattr(self, "cap", None)
        if cap is not None and cap.isOpened():
            cap.release()

    def updateFrame_(self, timer):
        """Timer callback: grab one camera frame and show it in the image view.

        Does nothing while a capture is being processed or when no opened
        camera is available. The most recent BGR frame is kept in
        ``self.current_frame`` for ``processImage``.
        """
        if not self.is_capturing:
            return

        cap = getattr(self, "cap", None)
        if cap is None or not cap.isOpened():
            return

        ret, frame = cap.read()
        if not ret:
            return

        self.current_frame = frame  # Store BGR frame for later analysis
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        height, width = rgb_frame.shape[:2]

        # Prefix the raw RGB bytes with a binary PPM (P6) header so the whole
        # buffer can be handed to NSImage as encoded image data.
        header = f"P6 {width} {height} 255 ".encode()
        data = header + rgb_frame.tobytes()

        ns_data = objc.lookUpClass("NSData").dataWithBytes_length_(data, len(data))
        ns_image = NSImage.alloc().initWithData_(ns_data)
        self.image_view.setImage_(ns_image)

    def captureClicked_(self, sender):
        """Button action: freeze the preview and analyse the last frame.

        Ignored when a capture is already in progress.
        """
        if not self.is_capturing:
            return

        print("Capture clicked - Processing...")
        self.is_capturing = False
        self.capture_button.setTitle_("Processing...")
        self.capture_button.setEnabled_(False)
        self.text_view.setString_("Analyzing image...")

        # The API call blocks, so it runs off the main thread.
        threading.Thread(target=self.processImage).start()

    def resetScan_(self, sender):
        """Button action after a result: clear the UI and resume the preview."""
        print("Resetting...")
        self.text_view.setString_("")
        self.capture_button.setTitle_("Capture")
        self.capture_button.setAction_("captureClicked:")
        self.is_capturing = True

        # Blank the web view so the previous search is not left on screen.
        url = NSURL.URLWithString_("about:blank")
        request = NSURLRequest.requestWithURL_(url)
        self.web_view.loadRequest_(request)

    def processImage(self):
        """Send the last captured frame to OpenAI (runs on a worker thread).

        The frame is JPEG-encoded, base64-embedded in a data URL and sent with
        a prompt asking for a JSON object with ``description`` and
        ``search_term`` keys. The reply text is delivered to
        ``handleResponse:`` on the main thread; any failure (no frame,
        encoding failure, empty reply, API error) is delivered to
        ``handleError:`` on the main thread instead.
        """
        try:
            frame = self.current_frame
            if frame is None:
                self.performSelectorOnMainThread_withObject_waitUntilDone_(
                    "handleError:", "No frame captured", False
                )
                return

            # Encode image to base64; imencode reports failure via its flag.
            encoded_ok, buffer = cv2.imencode(".jpg", frame)
            if not encoded_ok:
                raise RuntimeError("Could not encode frame as JPEG")
            base64_image = base64.b64encode(buffer).decode("utf-8")

            client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

            prompt_text = (
                "Identify the main item in the foreground, including the brand name if visible. Ignore the background and any people present. "
                "Return a JSON object with two keys: 'description' (a brief description of the item including brand) "
                "and 'search_term' (keywords to search for this item on Amazon, including brand). "
                "Return ONLY the JSON. Do not wrap in markdown code blocks."
            )

            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt_text},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{base64_image}"
                                },
                            },
                        ],
                    }
                ],
                max_tokens=300,
            )

            result_text = response.choices[0].message.content
            if not result_text:
                # Without this, None would reach handleResponse: and fail there.
                raise RuntimeError("Model returned an empty response")

            self.performSelectorOnMainThread_withObject_waitUntilDone_(
                "handleResponse:", result_text, False
            )
        except Exception as e:
            # Worker-thread boundary: report every failure through the UI.
            self.performSelectorOnMainThread_withObject_waitUntilDone_(
                "handleError:", str(e), False
            )

    def handleResponse_(self, result):
        """Show the model's reply and load the matching Amazon search.

        ``result`` is the raw reply text. Markdown code fences are stripped
        before JSON parsing. On success the description is shown and, when a
        search term is present, the Amazon results page is loaded in the web
        view. Parse failures are shown in the text view. In every case the
        button is switched to "Scan Another" / ``resetScan:``.
        """
        from urllib.parse import quote_plus

        print(f"OpenAI Response received: {result}")

        try:
            # Clean up result if it contains markdown formatting.
            clean_result = (result or "").replace("```json", "").replace("```", "").strip()

            data = json.loads(clean_result)
            if not isinstance(data, dict):
                # Valid JSON but not the requested object shape.
                raise ValueError("response is not a JSON object")

            description = str(data.get("description", "No description found."))
            search_term = str(data.get("search_term") or "").strip()

            self.text_view.setString_(description)

            if search_term:
                # Percent-encode the keywords so characters such as '&', '#'
                # or non-ASCII text cannot corrupt the query string; spaces
                # still become '+', as before.
                search_query = quote_plus(search_term)
                amazon_url = f"https://www.amazon.com/s?k={search_query}"
                print(f"Loading Amazon URL: {amazon_url}")

                url = NSURL.URLWithString_(amazon_url)
                request = NSURLRequest.requestWithURL_(url)
                self.web_view.loadRequest_(request)
            else:
                print("No search term found.")
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass, so this covers
            # both malformed JSON and a non-object reply.
            print("Failed to parse JSON response")
            self.text_view.setString_(f"Error parsing response: {result}")
        except Exception as e:
            print(f"Error handling response: {e}")
            self.text_view.setString_(f"Error: {e}")

        self.capture_button.setTitle_("Scan Another")
        self.capture_button.setEnabled_(True)
        self.capture_button.setAction_("resetScan:")

    def handleError_(self, error_msg):
        """Show an error and return the UI to the live-preview state."""
        print(f"Error: {error_msg}")
        self.text_view.setString_(f"Error: {error_msg}")
        self.capture_button.setTitle_("Error - Try Again")
        self.capture_button.setEnabled_(True)
        # Make sure the button triggers a new capture rather than a reset.
        self.capture_button.setAction_("captureClicked:")
        self.is_capturing = True
if __name__ == "__main__":
    # Script entry point: configure the shared NSApplication, attach the
    # ItemSenseApp delegate and hand control to the Cocoa event loop.
    application = NSApplication.sharedApplication()
    application.setActivationPolicy_(NSApplicationActivationPolicyRegular)

    app_delegate = ItemSenseApp.alloc().init()
    application.setDelegate_(app_delegate)

    # Bring the app to the foreground before entering the run loop.
    application.activateIgnoringOtherApps_(True)
    AppHelper.runEventLoop()