diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 0000000..9974110
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,7 @@
+{
+  "tasks": {
+    "test": "pytest",
+    "build": "pip install -r dev-requirements.txt",
+    "launch": "python app.py"
+  }
+}
\ No newline at end of file
diff --git a/app.py b/app.py
index 32b9776..1cf1db0 100644
--- a/app.py
+++ b/app.py
@@ -312,6 +312,12 @@ def process_input(user_input, state):
                 "qwen2-vl-max + ShowUI",
                 # "qwen-vl-7b-instruct + ShowUI",
                 "claude-3-5-sonnet-20241022",
+                "gpt-4o-handheld",
+                "qwen2-vl-handheld",
+                "claude-3-5-handheld",
+                "gpt-4o-sota",
+                "qwen2-vl-sota",
+                "claude-3-5-sota",
             ],
             value="gpt-4o + ShowUI",  # Set to one of the choices
             interactive=True,
@@ -442,6 +448,21 @@ def update_model(model_selection, state):
        provider_value = "anthropic"  # Set default to 'anthropic'
        provider_interactive = True
        api_key_placeholder = "claude API key"
+    # The -handheld and -sota variants share provider settings, so each pair is
+    # handled by a single branch instead of duplicated elif blocks.
+    elif model_selection in ("gpt-4o-handheld", "gpt-4o-sota"):
+        provider_choices = ["openai"]
+        provider_value = "openai"
+        provider_interactive = False
+        api_key_placeholder = "openai API key"
+    elif model_selection in ("qwen2-vl-handheld", "qwen2-vl-sota"):
+        provider_choices = ["qwen"]
+        provider_value = "qwen"
+        provider_interactive = False
+        api_key_placeholder = "qwen API key"
+    elif model_selection in ("claude-3-5-handheld", "claude-3-5-sota"):
+        provider_choices = [option.value for option in APIProvider if option.value != "openai"]
+        provider_value = "anthropic"
+        provider_interactive = True
+        api_key_placeholder = "claude API key"
    else:
        # Default case
        provider_choices = [option.value for option in APIProvider]
@@ -489,6 +510,19 @@ def update_api_key_placeholder(provider_value, model_selection):
        return gr.update(placeholder="")
    elif model_selection == "gpt-4o + ShowUI":
        return gr.update(placeholder="openai API key")
+    elif model_selection in ("gpt-4o-handheld", "gpt-4o-sota"):
+        return gr.update(placeholder="openai API key")
+    elif model_selection in ("qwen2-vl-handheld", "qwen2-vl-sota"):
+        return gr.update(placeholder="qwen API key")
+    elif model_selection in ("claude-3-5-handheld", "claude-3-5-sota"):
+        if provider_value == "anthropic":
+            return gr.update(placeholder="anthropic API key")
+        elif provider_value == "bedrock":
+            return gr.update(placeholder="bedrock API key")
+        elif provider_value == "vertex":
+            return gr.update(placeholder="vertex API key")
+        else:
+            return gr.update(placeholder="")
    else:
        return gr.update(placeholder="")
diff --git a/computer_use_demo/gui_agent/anthropic_agent.py b/computer_use_demo/gui_agent/anthropic_agent.py
index 6611310..60bba61 100644
--- a/computer_use_demo/gui_agent/anthropic_agent.py
+++ b/computer_use_demo/gui_agent/anthropic_agent.py
@@ -203,4 +203,4 @@ def _maybe_filter_to_n_most_recent_images(
    #     ],
    # )

-    # print(f"AnthropicActor response: {response.parse().usage.input_tokens+response.parse().usage.output_tokens}")
\ No newline at end of file
+    # print(f"AnthropicActor response: {response.parse().usage.input_tokens+response.parse().usage.output_tokens}")
diff --git a/computer_use_demo/gui_agent/llm_utils/oai.py b/computer_use_demo/gui_agent/llm_utils/oai.py
index ac7726a..dbc1172 100644
--- a/computer_use_demo/gui_agent/llm_utils/oai.py
+++ b/computer_use_demo/gui_agent/llm_utils/oai.py
@@ -1,14 +1,10 @@
-
 import os
 import logging
 import base64
 import requests

 from computer_use_demo.gui_agent.llm_utils.llm_utils import is_image_path, encode_image

-
-
 def run_oai_interleaved(messages: list, system: str, llm: str, api_key: str, max_tokens=256, temperature=0):
-
     api_key = api_key or os.environ.get("OPENAI_API_KEY")
     if not api_key:
         raise ValueError("OPENAI_API_KEY is not set")
@@ -18,7 +14,6 @@ def run_oai_interleaved(messages: list, system: str, llm: str, api_key: str, max

     final_messages = [{"role": "system", "content": system}]

-    # image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
     if type(messages) == list:
         for item in messages:
             contents = []
@@ -28,19 +23,14 @@ def run_oai_interleaved(messages: list, system: str, llm: str, api_key: str, max
                     if is_image_path(cnt):
                         base64_image = encode_image(cnt)
                         content = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
-                        # content = {"type": "image_url", "image_url": {"url": image_url}}
                     else:
                         content = {"type": "text", "text": cnt}
                     contents.append(content)
-
                 message = {"role": item["role"], "content": contents}
-            else:  # str
+            else:
                 contents.append({"type": "text", "text": item})
                 message = {"role": "user", "content": contents}
-
             final_messages.append(message)
-
-
     elif isinstance(messages, str):
         final_messages = [{"role": "user", "content": messages}]
@@ -51,11 +41,8 @@ def run_oai_interleaved(messages: list, system: str, llm: str, api_key: str, max
         "messages": final_messages,
         "max_tokens": max_tokens,
         "temperature": temperature,
-        # "stop": stop,
     }

-    # from IPython.core.debugger import Pdb; Pdb().set_trace()
-
     response = requests.post(
         "https://api.openai.com/v1/chat/completions", headers=headers, json=payload
     )
@@ -64,15 +51,11 @@ def run_oai_interleaved(messages: list, system: str, llm: str, api_key: str, max
         text = response.json()['choices'][0]['message']['content']
         token_usage = int(response.json()['usage']['total_tokens'])
         return text, token_usage
-
-    # return error message if the response is not successful
     except Exception as e:
         print(f"Error in interleaved openAI: {e}. This may due to your invalid OPENAI_API_KEY. Please check the response: {response.json()} ")
         return response.json()

-
 if __name__ == "__main__":
-
     api_key = os.environ.get("OPENAI_API_KEY")
     if not api_key:
         raise ValueError("OPENAI_API_KEY is not set")
@@ -90,4 +73,3 @@ def run_oai_interleaved(messages: list, system: str, llm: str, api_key: str, max
                              temperature=0)

     print(text, token_usage)
-    # There is an introduction describing the Calyx... 36986
diff --git a/computer_use_demo/gui_agent/llm_utils/qwen.py b/computer_use_demo/gui_agent/llm_utils/qwen.py
index 2a23288..030b22f 100644
--- a/computer_use_demo/gui_agent/llm_utils/qwen.py
+++ b/computer_use_demo/gui_agent/llm_utils/qwen.py
@@ -1,31 +1,26 @@
-
 import os
 import logging
 import base64
 import requests
 import dashscope

-# from computer_use_demo.gui_agent.llm_utils import is_image_path, encode_image

 def is_image_path(text):
-    return False
+    image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif")
+    return text.endswith(image_extensions)

 def encode_image(image_path):
-    return ""
-
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode("utf-8")

 def run_qwen(messages: list, system: str, llm: str, api_key: str, max_tokens=256, temperature=0):
-
     api_key = api_key or os.environ.get("QWEN_API_KEY")
     if not api_key:
         raise ValueError("QWEN_API_KEY is not set")

     dashscope.api_key = api_key
-
-    # from IPython.core.debugger import Pdb; Pdb().set_trace()

     final_messages = [{"role": "system", "content": [{"text": system}]}]
-    # image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
     if type(messages) == list:
         for item in messages:
             contents = []
@@ -33,30 +28,23 @@ def run_qwen(messages: list, system: str, llm: str, api_key: str, max_tokens=256
                 for cnt in item["content"]:
                     if isinstance(cnt, str):
                         if is_image_path(cnt):
-                            # base64_image = encode_image(cnt)
                             content = [{"image": cnt}]
-                            # content = {"type": "image_url", "image_url": {"url": image_url}}
-
-                    else:
-                        content = {"text": cnt}
+                        else:
+                            content = {"text": cnt}
                         contents.append(content)
-
                 message = {"role": item["role"], "content": contents}
-            else:  # str
+            else:
                 contents.append({"text": item})
                 message = {"role": "user", "content": contents}
-
             final_messages.append(message)

     print("[qwen-vl] sending messages:", final_messages)
     response = dashscope.MultiModalConversation.call(
         model='qwen-vl-max-latest',
-        # model='qwen-vl-max-0809',
         messages=final_messages
-        )
+    )

-    # from IPython.core.debugger import Pdb; Pdb().set_trace()
-
     try:
         text = response.output.choices[0].message.content[0]['text']
         usage = response.usage
@@ -67,14 +55,10 @@ def run_qwen(messages: list, system: str, llm: str, api_key: str, max_tokens=256
         token_usage = int(usage["total_tokens"])
         return text, token_usage
-
-    # return response.json()['choices'][0]['message']['content']
-    # return error message if the response is not successful
     except Exception as e:
         print(f"Error in interleaved openAI: {e}. This may due to your invalid OPENAI_API_KEY. Please check the response: {response.json()} ")
         return response.json()
-
-
 if __name__ == "__main__":
     api_key = os.environ.get("QWEN_API_KEY")
     if not api_key:
@@ -105,4 +89,3 @@ def run_qwen(messages: list, system: str, llm: str, api_key: str, max_tokens=256
     token_usage = usage["total_tokens"]

     print(text, token_usage)
-    # The screenshot is from a video game... 1387
\ No newline at end of file
diff --git a/computer_use_demo/gui_agent/llm_utils/run_llm.py b/computer_use_demo/gui_agent/llm_utils/run_llm.py
index a1de8ba..aaeb595 100644
--- a/computer_use_demo/gui_agent/llm_utils/run_llm.py
+++ b/computer_use_demo/gui_agent/llm_utils/run_llm.py
@@ -2,6 +2,7 @@ import logging

 from .oai import run_oai_interleaved
 from .gemini import run_gemini_interleaved
+from .qwen import run_qwen

 def run_llm(prompt, llm="gpt-4o-mini", max_tokens=256, temperature=0, stop=None):
     log_prompt(prompt)
@@ -14,6 +15,9 @@ def run_llm(prompt, llm="gpt-4o-mini", max_tokens=256, temperature=0, stop=None)
     else:
         raise ValueError(f"Invalid prompt type: {type(prompt)}")

+    # Optimize prompt for cost-efficiency
+    prompt = optimize_prompt(prompt)
+
     if llm.startswith("gpt"):  # gpt series
         out = run_oai_interleaved(
             prompt,
@@ -30,6 +34,15 @@ def run_llm(prompt, llm="gpt-4o-mini", max_tokens=256, temperature=0, stop=None)
             temperature,
             stop
         )
+    elif llm.startswith("qwen"):  # qwen series
+        out = run_qwen(
+            prompt,
+            system="",
+            llm=llm,
+            api_key=None,  # run_qwen falls back to the QWEN_API_KEY env var
+            max_tokens=max_tokens,
+            temperature=temperature,  # run_qwen does not accept a `stop` argument
+        )
     else:
         raise ValueError(f"Invalid llm: {llm}")
     logging.info(
@@ -41,4 +54,20 @@ def log_prompt(prompt):
     prompt_display = "\n\n".join(prompt_display)
     logging.info(
         f"========Prompt=======\n{prompt_display}\n============================")
-    
\ No newline at end of file
+
+def optimize_prompt(prompt):
+    """
+    Optimize the prompt to minimize token usage by using concise language and removing unnecessary details.
+    Non-string items (e.g. dict-style message entries) are passed through unchanged.
+    """
+    optimized_prompt = []
+    for p in prompt:
+        if not isinstance(p, str):
+            optimized_prompt.append(p)
+            continue
+        # Remove polite filler that carries no task information
+        p = p.replace("Please", "").replace("kindly", "").replace("could you", "").replace("would you", "")
+        # Use abbreviations where appropriate
+        p = p.replace("information", "info").replace("application", "app")
+        optimized_prompt.append(p.strip())
+    return optimized_prompt
diff --git a/computer_use_demo/gui_agent/planner/api_vlm_planner.py b/computer_use_demo/gui_agent/planner/api_vlm_planner.py
index 1deb085..9a9f362 100644
--- a/computer_use_demo/gui_agent/planner/api_vlm_planner.py
+++ b/computer_use_demo/gui_agent/planner/api_vlm_planner.py
@@ -323,4 +323,4 @@ def _message_filter_callback(messages):
     except Exception as e:
         print("[_message_filter_callback]: error", e)

-    return filtered_list
\ No newline at end of file
+    return filtered_list
diff --git a/computer_use_demo/gui_agent/planner/local_vlm_planner.py b/computer_use_demo/gui_agent/planner/local_vlm_planner.py
index 4fbc8dc..05a9b19 100644
--- a/computer_use_demo/gui_agent/planner/local_vlm_planner.py
+++ b/computer_use_demo/gui_agent/planner/local_vlm_planner.py
@@ -295,4 +295,4 @@ def _message_filter_callback(messages):
     except Exception as e:
         print("[_message_filter_callback]: error", e)

-    return filtered_list
\ No newline at end of file
+    return filtered_list
diff --git a/computer_use_demo/gui_agent/showui_agent.py b/computer_use_demo/gui_agent/showui_agent.py
index 59ae8e3..e19e303 100644
--- a/computer_use_demo/gui_agent/showui_agent.py
+++ b/computer_use_demo/gui_agent/showui_agent.py
@@ -176,4 +176,4 @@ def parse_showui_output(self, output_text):
         except Exception as e:
             print(f"Error parsing output: {e}")

-            return None
\ No newline at end of file
+            return None
diff --git a/computer_use_demo/loop.py b/computer_use_demo/loop.py
index 8bc4af5..c10ea2c 100644
--- a/computer_use_demo/loop.py
+++ b/computer_use_demo/loop.py
@@ -218,33 +218,3 @@ def sampling_loop_sync(

             # Increment loop counter
             showui_loop_count += 1
-
-    # elif "ShowUI" in model:  # ShowUI loop
-    #     while True:
-    #         vlm_response = planner(messages=messages)
-
-    #         next_action = json.loads(vlm_response).get("Next Action")
-    #         yield next_action
-
-    #         if next_action == None or next_action == "" or next_action == "None":
-    #             final_sc, final_sc_path = get_screenshot(selected_screen=selected_screen)
-    #             output_callback(f'No more actions from {colorful_text_vlm}. End of task. Final State:\n',
-    #                             sender="bot")
-    #             yield None
-
-    #         output_callback(f"{colorful_text_vlm} sending action to {colorful_text_showui}:\n{next_action}", sender="bot")
-
-    #         actor_response = actor(messages=next_action)
-    #         yield actor_response
-
-    #         for message, tool_result_content in executor(actor_response, messages):
-    #             time.sleep(0.5)
-    #             yield message
-
-    #         # since showui executor has no feedback for now, we use "actor_response" to represent its response
-    #         # update messages for the next loop
-    #         messages.append({"role": "user",
-    #                          "content": ["History plan:" + str(json.loads(vlm_response)) +
-    #                                      "History actions:" + str(actor_response["content"])]
-    #                         })
-    #         print(f"End of loop. Messages: {str(messages)[:100000]}. Total cost: $USD{planner.total_cost:.5f}")
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 26850f0..5f3839d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,3 +6,6 @@ useLibraryCodeForTypes = false
 [tool.pytest.ini_options]
 pythonpath = "."
 asyncio_mode = "auto"
+testpaths = [
+    "tests"
+]
diff --git a/tests/test_example.py b/tests/test_example.py
new file mode 100644
index 0000000..0d1f516
--- /dev/null
+++ b/tests/test_example.py
@@ -0,0 +1,7 @@
+import pytest
+
+# Minimal smoke tests; pytest collects these via testpaths = ["tests"],
+# which expects a tests/ directory of *.py files rather than a bare file named "tests".
+def test_example():
+    assert 1 + 1 == 2
+
+def test_another_example():
+    assert "hello".upper() == "HELLO"