From 8b6e33af35a6edf52a3f4f4b9a0898b5bbf4a0fa Mon Sep 17 00:00:00 2001 From: ashdude1401 Date: Wed, 26 Feb 2025 12:46:05 +0530 Subject: [PATCH 1/9] feat: add setup script and enhance custom agent and prompts for iframe interactions --- setup.ps1 | 18 ++++++++++++++++ src/agent/custom_agent.py | 4 ++-- src/agent/custom_prompts.py | 41 +++++++++++++++++++++++++++++++++++-- 3 files changed, 59 insertions(+), 4 deletions(-) create mode 100644 setup.ps1 diff --git a/setup.ps1 b/setup.ps1 new file mode 100644 index 00000000..6a2d69fd --- /dev/null +++ b/setup.ps1 @@ -0,0 +1,18 @@ +deactivate + +Remove-Item -Recurse -Force .venv + +# Step 2: Set Up Python Environment +uv venv --python 3.11 + +# Activate the virtual environment +.\.venv\Scripts\Activate.ps1 + +# Step 3: Install Dependencies +uv pip install -r requirements.txt +playwright install + + +# Step 4: Run web ui in local +python webui.py --ip 127.0.0.1 --port 7788 +Write-Output "Setup complete. Virtual environment activated." diff --git a/src/agent/custom_agent.py b/src/agent/custom_agent.py index bfeb33ca..6202edad 100644 --- a/src/agent/custom_agent.py +++ b/src/agent/custom_agent.py @@ -57,11 +57,11 @@ def __init__( use_vision_for_planner: bool = False, save_conversation_path: Optional[str] = None, save_conversation_path_encoding: Optional[str] = 'utf-8', - max_failures: int = 3, + max_failures: int = 5, retry_delay: int = 10, system_prompt_class: Type[SystemPrompt] = SystemPrompt, agent_prompt_class: Type[AgentMessagePrompt] = AgentMessagePrompt, - max_input_tokens: int = 128000, + max_input_tokens: int = 1280000, validate_output: bool = False, message_context: Optional[str] = None, generate_gif: bool | str = True, diff --git a/src/agent/custom_prompts.py b/src/agent/custom_prompts.py index ab8c9a1e..6b0e9d7b 100644 --- a/src/agent/custom_prompts.py +++ b/src/agent/custom_prompts.py @@ -43,6 +43,12 @@ def important_rules(self) -> str: {"go_to_url": {"url": "https://example.com"}}, {"extract_page_content": {}} ] + - Iframe interaction: [ + {"switch_frame": {"frame_name": "GlobalNav"}}, + {"click_element": {"index": 1}}, + {"switch_frame": {"frame_name": "frameContent"}}, + {"click_element": {"index": 2}} + ] 3. ELEMENT INTERACTION: @@ -82,8 +88,39 @@ def important_rules(self) -> str: - Only provide the action sequence until you think the page will change. - Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page like saving, extracting, checkboxes... - only use multiple actions if it makes sense. - -9. Extraction: +9. IFrames: + - Identify iframes using their names or unique identifiers + - Switch to iframes before interacting with nested elements + - Use frame locators for element interaction within iframes + - Example action sequence for iframe interaction: + [ + {"switch_frame": {"frame_name": "GlobalNav"}}, + {"click_element": {"index": 1}}, + {"switch_frame": {"frame_name": "frameContent"}}, + {"click_element": {"index": 2}} + ] + - Always return to the main frame after iframe operations + - Handle nested iframes by chaining switch_frame actions + 10. Action Sequencing for Iframes: + - Always start iframe interactions with switch_frame + - Perform all element interactions within the iframe context + - Use back_to_main_frame after completing iframe operations + - For nested iframes, chain switch_frame actions + - Example nested iframe sequence: + [ + {"switch_frame": {"frame_name": "outerFrame"}}, + {"switch_frame": {"frame_name": "innerFrame"}}, + {"click_element": {"index": 1}}, + {"back_to_main_frame": {}} + ] + + 11. Visual Context for Iframes: + - Bounding boxes for iframe elements will have frame name labels + - Example: [GlobalNav] + - Use frame labels to identify element context + - Elements without frame labels are in the main page + +12. Extraction: - If your task is to find information or do research - call extract_content on the specific pages to get and store the information. """ From 9c427307a3fdd25efcf332025e687ab45b1f4f3d Mon Sep 17 00:00:00 2001 From: ashdude1401 Date: Wed, 26 Feb 2025 12:51:41 +0530 Subject: [PATCH 2/9] feat: add setup script and enhance custom agent and prompts for iframe interactions --- setup.ps1 | 18 ++++++++++++++++ src/agent/custom_agent.py | 4 ++-- src/agent/custom_prompts.py | 41 +++++++++++++++++++++++++++++++++++-- 3 files changed, 59 insertions(+), 4 deletions(-) create mode 100644 setup.ps1 diff --git a/setup.ps1 b/setup.ps1 new file mode 100644 index 00000000..6a2d69fd --- /dev/null +++ b/setup.ps1 @@ -0,0 +1,18 @@ +deactivate + +Remove-Item -Recurse -Force .venv + +# Step 2: Set Up Python Environment +uv venv --python 3.11 + +# Activate the virtual environment +.\.venv\Scripts\Activate.ps1 + +# Step 3: Install Dependencies +uv pip install -r requirements.txt +playwright install + + +# Step 4: Run web ui in local +python webui.py --ip 127.0.0.1 --port 7788 +Write-Output "Setup complete. Virtual environment activated." diff --git a/src/agent/custom_agent.py b/src/agent/custom_agent.py index bfeb33ca..6202edad 100644 --- a/src/agent/custom_agent.py +++ b/src/agent/custom_agent.py @@ -57,11 +57,11 @@ def __init__( use_vision_for_planner: bool = False, save_conversation_path: Optional[str] = None, save_conversation_path_encoding: Optional[str] = 'utf-8', - max_failures: int = 3, + max_failures: int = 5, retry_delay: int = 10, system_prompt_class: Type[SystemPrompt] = SystemPrompt, agent_prompt_class: Type[AgentMessagePrompt] = AgentMessagePrompt, - max_input_tokens: int = 128000, + max_input_tokens: int = 1280000, validate_output: bool = False, message_context: Optional[str] = None, generate_gif: bool | str = True, diff --git a/src/agent/custom_prompts.py b/src/agent/custom_prompts.py index ab8c9a1e..6b0e9d7b 100644 --- a/src/agent/custom_prompts.py +++ b/src/agent/custom_prompts.py @@ -43,6 +43,12 @@ def important_rules(self) -> str: {"go_to_url": {"url": "https://example.com"}}, {"extract_page_content": {}} ] + - Iframe interaction: [ + {"switch_frame": {"frame_name": "GlobalNav"}}, + {"click_element": {"index": 1}}, + {"switch_frame": {"frame_name": "frameContent"}}, + {"click_element": {"index": 2}} + ] 3. ELEMENT INTERACTION: @@ -82,8 +88,39 @@ def important_rules(self) -> str: - Only provide the action sequence until you think the page will change. - Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page like saving, extracting, checkboxes... - only use multiple actions if it makes sense. - -9. Extraction: +9. IFrames: + - Identify iframes using their names or unique identifiers + - Switch to iframes before interacting with nested elements + - Use frame locators for element interaction within iframes + - Example action sequence for iframe interaction: + [ + {"switch_frame": {"frame_name": "GlobalNav"}}, + {"click_element": {"index": 1}}, + {"switch_frame": {"frame_name": "frameContent"}}, + {"click_element": {"index": 2}} + ] + - Always return to the main frame after iframe operations + - Handle nested iframes by chaining switch_frame actions + 10. Action Sequencing for Iframes: + - Always start iframe interactions with switch_frame + - Perform all element interactions within the iframe context + - Use back_to_main_frame after completing iframe operations + - For nested iframes, chain switch_frame actions + - Example nested iframe sequence: + [ + {"switch_frame": {"frame_name": "outerFrame"}}, + {"switch_frame": {"frame_name": "innerFrame"}}, + {"click_element": {"index": 1}}, + {"back_to_main_frame": {}} + ] + + 11. Visual Context for Iframes: + - Bounding boxes for iframe elements will have frame name labels + - Example: [GlobalNav] + - Use frame labels to identify element context + - Elements without frame labels are in the main page + +12. Extraction: - If your task is to find information or do research - call extract_content on the specific pages to get and store the information. """ From 2954de3a061748b4d5bd868768833575445a1e59 Mon Sep 17 00:00:00 2001 From: ashdude1401 Date: Wed, 26 Feb 2025 13:37:01 +0530 Subject: [PATCH 3/9] feat: add step-by-step instructions for accessing Athenahealth Document Search --- prompts/prompt.text | 51 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 prompts/prompt.text diff --git a/prompts/prompt.text b/prompts/prompt.text new file mode 100644 index 00000000..41a816a7 --- /dev/null +++ b/prompts/prompt.text @@ -0,0 +1,51 @@ +Step-by-Step Instructions + +1. Navigate to Athenahealth Preview Environment + - Open a web browser and go to: https://preview.athenahealth.com/ + +2. Log In + - Enter the credentials: + - Username: p-bkumar1 + - Password: Xcaliber@12345 + - Click the Login button. + +3. Select the Default Department + - If prompted, choose the default department from the list (e.g., "7 Hills Department"). + +4. Access the "Patients" Menu + - Locate the header at the top of the dashboard. + - Click on the "Patients" menu to open the dropdown. + +5. Access Document Search + - Option 1 (Primary Attempt) + - In the dropdown, look for "Documents > Document Search" and click it. + - If the primary attempt fails (error 404 or element not found): + - Refresh the page. + - Retry clicking "Document Search" (up to 3 times with 2-second intervals). + +6. Handle Iframes (Fallback Approach) + - Use the following sequence if Document Search is nested in iframes: + - Switch to the main iframe context: + - Locate and switch to iframe[name="frMain"]. + - Switch to the sub-iframe: + - Locate and switch to iframe[id="searchFrame"] or iframe[name="frMain"] > iframe (if nested). + - Fill the DOCUMENTID and click Search: + - Enter the value "116873" in the DOCUMENTID field. + - Click the "Search" button. + - Retry up to 3 times: + - Wait 2 seconds between each retry if elements are missing. + +7. Observe and Report + - After clicking "Document Search" or executing the iframe fallback: + - The Clinical Inbox should display Jammy J. Willer's lab/imaging results, including CBC tests and MRI results with normal/abnormal statuses. + +Notes on Tier Group in Clinical Inbox +- The Tier Group level ID for the search results is 107 tasks associated with the 16.11 Testing department. + +Common Issues and Solutions +- Element Not Found: Ensure the iframe is fully loaded (wait for 5–10 seconds). +- Button Not Clickable: Refresh the page and try again. +- Network Errors: Verify your internet connection and retry the login process. + +Result +- After following the steps, the Clinical Inbox displays lab/imaging results for Jammy J. Willer with task updates. The task is completed successfully. From a46f98c7c17bc96fe6f4cb83dc481754e32fffb3 Mon Sep 17 00:00:00 2001 From: ashdude1401 Date: Wed, 26 Feb 2025 13:47:16 +0530 Subject: [PATCH 4/9] feat: update prompt instructions for Clinical Inbox and refine common issues section --- prompts/prompt.text | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/prompts/prompt.text b/prompts/prompt.text index 41a816a7..f55fc85d 100644 --- a/prompts/prompt.text +++ b/prompts/prompt.text @@ -37,15 +37,11 @@ Step-by-Step Instructions 7. Observe and Report - After clicking "Document Search" or executing the iframe fallback: - - The Clinical Inbox should display Jammy J. Willer's lab/imaging results, including CBC tests and MRI results with normal/abnormal statuses. - -Notes on Tier Group in Clinical Inbox -- The Tier Group level ID for the search results is 107 tasks associated with the 16.11 Testing department. - + Common Issues and Solutions - Element Not Found: Ensure the iframe is fully loaded (wait for 5–10 seconds). -- Button Not Clickable: Refresh the page and try again. +- Button Not Clickable: try to force click button again. - Network Errors: Verify your internet connection and retry the login process. Result -- After following the steps, the Clinical Inbox displays lab/imaging results for Jammy J. Willer with task updates. The task is completed successfully. +- After following the steps, patient lab reports will be displayed. The task is completed successfully. From b571736c8fd00b34e02712654135d60cea302901 Mon Sep 17 00:00:00 2001 From: ashdude1401 Date: Sat, 1 Mar 2025 14:51:07 +0530 Subject: [PATCH 5/9] feat: update Dockerfile and configuration for reduced resolution; modify custom agent to disable vision usage --- Dockerfile | 6 +- src/agent/custom_agent.py | 6 +- src/utils/default_config_settings.py | 2 +- supervisord.conf | 121 ++++++++++++++++++++++++--- webui.py | 25 ++++-- 5 files changed, 136 insertions(+), 24 deletions(-) diff --git a/Dockerfile b/Dockerfile index 7b6d39fe..44d47511 100644 --- a/Dockerfile +++ b/Dockerfile @@ -71,11 +71,11 @@ ENV BROWSER_USE_LOGGING_LEVEL=info ENV CHROME_PATH=/ms-playwright/chromium-*/chrome-linux/chrome ENV ANONYMIZED_TELEMETRY=false ENV DISPLAY=:99 -ENV RESOLUTION=1920x1080x24 +ENV RESOLUTION=960x540x24 ENV VNC_PASSWORD=vncpassword ENV CHROME_PERSISTENT_SESSION=true -ENV RESOLUTION_WIDTH=1920 -ENV RESOLUTION_HEIGHT=1080 +ENV RESOLUTION_WIDTH=960 +ENV RESOLUTION_HEIGHT=540 # Set up supervisor configuration RUN mkdir -p /var/log/supervisor diff --git a/src/agent/custom_agent.py b/src/agent/custom_agent.py index 6202edad..27379cc0 100644 --- a/src/agent/custom_agent.py +++ b/src/agent/custom_agent.py @@ -53,7 +53,7 @@ def __init__( browser: Browser | None = None, browser_context: BrowserContext | None = None, controller: Controller = Controller(), - use_vision: bool = True, + use_vision: bool = False, use_vision_for_planner: bool = False, save_conversation_path: Optional[str] = None, save_conversation_path_encoding: Optional[str] = 'utf-8', @@ -281,8 +281,8 @@ async def _run_planner(self) -> Optional[str]: planner_messages[-1] = HumanMessage(content=new_msg) # Get planner output - response = await self.planner_llm.ainvoke(planner_messages) - plan = response.content + response = await self.ainvoke(planner_messages) + plan = response.contentplanner_llm last_state_message = planner_messages[-1] # remove image from last state message if isinstance(last_state_message.content, list): diff --git a/src/utils/default_config_settings.py b/src/utils/default_config_settings.py index e6fa88f9..5d7c6662 100644 --- a/src/utils/default_config_settings.py +++ b/src/utils/default_config_settings.py @@ -10,7 +10,7 @@ def default_config(): "agent_type": "custom", "max_steps": 100, "max_actions_per_step": 10, - "use_vision": True, + "use_vision": False, "tool_calling_method": "auto", "llm_provider": "openai", "llm_model_name": "gpt-4o", diff --git a/supervisord.conf b/supervisord.conf index 3410b912..a59a94e5 100644 --- a/supervisord.conf +++ b/supervisord.conf @@ -1,3 +1,100 @@ +# [supervisord] +# user=root +# nodaemon=true +# logfile=/dev/stdout +# logfile_maxbytes=0 +# loglevel=debug + +# [program:xvfb] +# command=Xvfb :99 -screen 0 %(ENV_RESOLUTION)s -ac +extension GLX +render -noreset +# autorestart=true +# stdout_logfile=/dev/stdout +# stdout_logfile_maxbytes=0 +# stderr_logfile=/dev/stderr +# stderr_logfile_maxbytes=0 +# priority=100 +# startsecs=3 +# stopsignal=TERM +# stopwaitsecs=10 + +# [program:vnc_setup] +# command=bash -c "mkdir -p ~/.vnc && echo '%(ENV_VNC_PASSWORD)s' | vncpasswd -f > ~/.vnc/passwd && chmod 600 ~/.vnc/passwd && ls -la ~/.vnc/passwd" +# autorestart=false +# startsecs=0 +# priority=150 +# stdout_logfile=/dev/stdout +# stdout_logfile_maxbytes=0 +# stderr_logfile=/dev/stderr +# stderr_logfile_maxbytes=0 + +# [program:x11vnc] +# command=bash -c "mkdir -p /var/log && touch /var/log/x11vnc.log && chmod 666 /var/log/x11vnc.log && sleep 5 && DISPLAY=:99 x11vnc -display :99 -forever -shared -rfbauth /root/.vnc/passwd -bg -rfbport 5901 -o /var/log/x11vnc.log" +# autorestart=true +# stdout_logfile=/dev/stdout +# stdout_logfile_maxbytes=0 +# stderr_logfile=/dev/stderr +# stderr_logfile_maxbytes=0 +# priority=200 +# startretries=10 +# startsecs=10 +# stopsignal=TERM +# stopwaitsecs=10 +# depends_on=vnc_setup,xvfb + +# [program:x11vnc_log] +# command=bash -c "mkdir -p /var/log && touch /var/log/x11vnc.log && tail -f /var/log/x11vnc.log" +# autorestart=true +# stdout_logfile=/dev/stdout +# stdout_logfile_maxbytes=0 +# stderr_logfile=/dev/stderr +# stderr_logfile_maxbytes=0 +# priority=250 +# stopsignal=TERM +# stopwaitsecs=5 +# depends_on=x11vnc + +# [program:novnc] +# command=bash -c "sleep 5 && cd /opt/novnc && ./utils/novnc_proxy --vnc localhost:5901 --listen 0.0.0.0:6080 --web /opt/novnc --http-header='Content-Security-Policy: frame-ancestors http://localhost:7788/'" +# autorestart=true +# stdout_logfile=/dev/stdout +# stdout_logfile_maxbytes=0 +# stderr_logfile=/dev/stderr +# stderr_logfile_maxbytes=0 +# priority=300 +# startretries=5 +# startsecs=3 +# depends_on=x11vnc + +# [program:persistent_browser] +# environment=START_URL="data:text/html,

Browser Ready

" +# command=bash -c "mkdir -p /app/data/chrome_data && sleep 8 && $(find /ms-playwright/chromium-*/chrome-linux -name chrome) --user-data-dir=/app/data/chrome_data --window-position=0,0 --window-size=%(ENV_RESOLUTION_WIDTH)s,%(ENV_RESOLUTION_HEIGHT)s --start-maximized --no-sandbox --disable-dev-shm-usage --disable-gpu --disable-software-rasterizer --disable-setuid-sandbox --no-first-run --no-default-browser-check --no-experiments --ignore-certificate-errors --remote-debugging-port=9222 --remote-debugging-address=0.0.0.0 \"$START_URL\"" +# autorestart=true +# stdout_logfile=/dev/stdout +# stdout_logfile_maxbytes=0 +# stderr_logfile=/dev/stderr +# stderr_logfile_maxbytes=0 +# priority=350 +# startretries=5 +# startsecs=10 +# stopsignal=TERM +# stopwaitsecs=15 +# depends_on=novnc + +# [program:webui] +# command=python webui.py --ip 0.0.0.0 --port 7788 +# directory=/app +# autorestart=true +# stdout_logfile=/dev/stdout +# stdout_logfile_maxbytes=0 +# stderr_logfile=/dev/stderr +# stderr_logfile_maxbytes=0 +# priority=400 +# startretries=3 +# startsecs=3 +# stopsignal=TERM +# stopwaitsecs=10 +# depends_on=persistent_browser + [supervisord] user=root nodaemon=true @@ -17,18 +114,18 @@ startsecs=3 stopsignal=TERM stopwaitsecs=10 -[program:vnc_setup] -command=bash -c "mkdir -p ~/.vnc && echo '%(ENV_VNC_PASSWORD)s' | vncpasswd -f > ~/.vnc/passwd && chmod 600 ~/.vnc/passwd && ls -la ~/.vnc/passwd" -autorestart=false -startsecs=0 -priority=150 -stdout_logfile=/dev/stdout -stdout_logfile_maxbytes=0 -stderr_logfile=/dev/stderr -stderr_logfile_maxbytes=0 +# [program:vnc_setup] +# command=bash -c "mkdir -p ~/.vnc && echo '%(ENV_VNC_PASSWORD)s' | vncpasswd -f > ~/.vnc/passwd && chmod 600 ~/.vnc/passwd && ls -la ~/.vnc/passwd" +# autorestart=false +# startsecs=0 +# priority=150 +# stdout_logfile=/dev/stdout +# stdout_logfile_maxbytes=0 +# stderr_logfile=/dev/stderr +# stderr_logfile_maxbytes=0 [program:x11vnc] -command=bash -c "mkdir -p /var/log && touch /var/log/x11vnc.log && chmod 666 /var/log/x11vnc.log && sleep 5 && DISPLAY=:99 x11vnc -display :99 -forever -shared -rfbauth /root/.vnc/passwd -rfbport 5901 -o /var/log/x11vnc.log" +command=bash -c "mkdir -p /var/log && touch /var/log/x11vnc.log && chmod 666 /var/log/x11vnc.log && sleep 5 && DISPLAY=:99 x11vnc -display :99 -nopw -forever -shared -bg -rfbport 5901 -o /var/log/x11vnc.log" autorestart=true stdout_logfile=/dev/stdout stdout_logfile_maxbytes=0 @@ -39,7 +136,7 @@ startretries=10 startsecs=10 stopsignal=TERM stopwaitsecs=10 -depends_on=vnc_setup,xvfb +depends_on=xvfb [program:x11vnc_log] command=bash -c "mkdir -p /var/log && touch /var/log/x11vnc.log && tail -f /var/log/x11vnc.log" @@ -93,4 +190,4 @@ startretries=3 startsecs=3 stopsignal=TERM stopwaitsecs=10 -depends_on=persistent_browser +depends_on=persistent_browser \ No newline at end of file diff --git a/webui.py b/webui.py index e770d99d..e1f4ae1c 100644 --- a/webui.py +++ b/webui.py @@ -897,11 +897,26 @@ def update_llm_num_ctx_visibility(llm_provider): run_button = gr.Button("▶️ Run Agent", variant="primary", scale=2) stop_button = gr.Button("⏹️ Stop", variant="stop", scale=1) + # with gr.Row(): + # browser_view = gr.HTML( + # value="

Waiting for browser session...

", + # label="Live Browser View", + # ) + with gr.Row(): - browser_view = gr.HTML( - value="

Waiting for browser session...

", - label="Live Browser View", - ) + gr.HTML( + """ + + """ + ) with gr.TabItem("🧐 Deep Research", id=5): research_task_input = gr.Textbox(label="Research Task", lines=5, value="Compose a report on the use of Reinforcement Learning for training Large Language Models, encompassing its origins, current advancements, and future prospects, substantiated with examples of relevant models and techniques. The report should reflect original insights and analysis, moving beyond mere summarization of existing literature.") @@ -961,7 +976,7 @@ def update_llm_num_ctx_visibility(llm_provider): enable_recording, task, add_infos, max_steps, use_vision, max_actions_per_step, tool_calling_method ], outputs=[ - browser_view, # Browser view + # browser_view, # Browser view final_result_output, # Final result errors_output, # Errors model_actions_output, # Model actions From 50a033ecc4a41f09a5ac2407df411b8bfa33c9bf Mon Sep 17 00:00:00 2001 From: prathamxcaliber Date: Sun, 2 Mar 2025 11:47:41 +0530 Subject: [PATCH 6/9] UI fixes for layout --- webui.py | 1129 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 801 insertions(+), 328 deletions(-) diff --git a/webui.py b/webui.py index e1f4ae1c..2ad1c397 100644 --- a/webui.py +++ b/webui.py @@ -68,6 +68,12 @@ def resolve_sensitive_env_variables(text): return result +def open_modal(): + return gr.update(visible=True) + +def close_modal(): + return gr.update(visible=False) + async def stop_agent(): """Request the agent to stop and update UI with enhanced feedback""" global _global_agent_state, _global_browser_context, _global_browser, _global_agent @@ -703,379 +709,846 @@ def create_ui(config, theme_name="Ocean"): """ with gr.Blocks( - title="Browser Use WebUI", theme=theme_map[theme_name], css=css + title="EHR Operator", theme=theme_map[theme_name], css="body { display: flex; justify-content: center; } #main-container { max-width: 1200px; width: 100%; }" ) as demo: with gr.Row(): gr.Markdown( """ - # 🌐 Browser Use WebUI - ### Control your browser with AI assistance +

🌐 EHR Operator

+

Control your browser with AI assistance

""", elem_classes=["header-text"], ) - with gr.Tabs() as tabs: - with gr.TabItem("⚙️ Agent Settings", id=1): - with gr.Group(): - agent_type = gr.Radio( - ["org", "custom"], - label="Agent Type", - value=config['agent_type'], - info="Select the type of agent to use", - ) - with gr.Column(): - max_steps = gr.Slider( - minimum=1, - maximum=200, - value=config['max_steps'], - step=1, - label="Max Run Steps", - info="Maximum number of steps the agent will take", - ) - max_actions_per_step = gr.Slider( - minimum=1, - maximum=20, - value=config['max_actions_per_step'], - step=1, - label="Max Actions per Step", - info="Maximum number of actions the agent will take per step", - ) - with gr.Column(): - use_vision = gr.Checkbox( - label="Use Vision", - value=config['use_vision'], - info="Enable visual processing capabilities", - ) - tool_calling_method = gr.Dropdown( - label="Tool Calling Method", - value=config['tool_calling_method'], - interactive=True, - allow_custom_value=True, # Allow users to input custom model names - choices=["auto", "json_schema", "function_calling"], - info="Tool Calls Funtion Name", - visible=False + with gr.Blocks(elem_id="main-container"): + with gr.Row(equal_height=True): + with gr.Column(scale=2, min_width=480, elem_id="left-column"): + with gr.Group(): + task = gr.Textbox( + label="Task Description", + lines=10, + placeholder="Enter your task here...", + value=config['task'], + info="Describe what you want the agent to do", ) - with gr.TabItem("🔧 LLM Configuration", id=2): - with gr.Group(): - llm_provider = gr.Dropdown( - choices=[provider for provider,model in utils.model_names.items()], - label="LLM Provider", - value=config['llm_provider'], - info="Select your preferred language model provider" - ) - llm_model_name = gr.Dropdown( - label="Model Name", - choices=utils.model_names['openai'], - value=config['llm_model_name'], - interactive=True, - allow_custom_value=True, # Allow users to input custom model names - info="Select a model from the dropdown or type a custom model name" - ) - llm_num_ctx = gr.Slider( - minimum=2**8, - maximum=2**16, - value=config['llm_num_ctx'], - step=1, - label="Max Context Length", - info="Controls max context length model needs to handle (less = faster)", - visible=config['llm_provider'] == "ollama" - ) - llm_temperature = gr.Slider( - minimum=0.0, - maximum=2.0, - value=config['llm_temperature'], - step=0.1, - label="Temperature", - info="Controls randomness in model outputs" - ) - with gr.Row(): - llm_base_url = gr.Textbox( - label="Base URL", - value=config['llm_base_url'], - info="API endpoint URL (if required)" + add_infos = gr.Textbox( + label="Additional Information", + lines=7, + placeholder="Add any helpful context or instructions...", + info="Optional hints to help the LLM complete the task", ) - llm_api_key = gr.Textbox( - label="API Key", - type="password", - value=config['llm_api_key'], - info="Your API key (leave blank to use .env)" - ) - - # Change event to update context length slider - def update_llm_num_ctx_visibility(llm_provider): - return gr.update(visible=llm_provider == "ollama") - - # Bind the change event of llm_provider to update the visibility of context length slider - llm_provider.change( - fn=update_llm_num_ctx_visibility, - inputs=llm_provider, - outputs=llm_num_ctx - ) - with gr.TabItem("🌐 Browser Settings", id=3): - with gr.Group(): with gr.Row(): - use_own_browser = gr.Checkbox( - label="Use Own Browser", - value=config['use_own_browser'], - info="Use your existing browser instance", - ) - keep_browser_open = gr.Checkbox( - label="Keep Browser Open", - value=config['keep_browser_open'], - info="Keep Browser Open between Tasks", - ) - headless = gr.Checkbox( - label="Headless Mode", - value=config['headless'], - info="Run browser without GUI", - ) - disable_security = gr.Checkbox( - label="Disable Security", - value=config['disable_security'], - info="Disable browser security features", - ) - enable_recording = gr.Checkbox( - label="Enable Recording", - value=config['enable_recording'], - info="Enable saving browser recordings", - ) + run_button = gr.Button("▶️ Run Agent", variant="primary", scale=1) + stop_button = gr.Button("⏹️ Stop", variant="stop", scale=1) - with gr.Row(): - window_w = gr.Number( - label="Window Width", - value=config['window_w'], - info="Browser window width", - ) - window_h = gr.Number( - label="Window Height", - value=config['window_h'], - info="Browser window height", - ) - - save_recording_path = gr.Textbox( - label="Recording Path", - placeholder="e.g. ./tmp/record_videos", - value=config['save_recording_path'], - info="Path to save browser recordings", - interactive=True, # Allow editing only if recording is enabled - ) - - save_trace_path = gr.Textbox( - label="Trace Path", - placeholder="e.g. ./tmp/traces", - value=config['save_trace_path'], - info="Path to save Agent traces", - interactive=True, - ) - - save_agent_history_path = gr.Textbox( - label="Agent History Save Path", - placeholder="e.g., ./tmp/agent_history", - value=config['save_agent_history_path'], - info="Specify the directory where agent history should be saved.", - interactive=True, - ) - - with gr.TabItem("🤖 Run Agent", id=4): - task = gr.Textbox( - label="Task Description", - lines=4, - placeholder="Enter your task here...", - value=config['task'], - info="Describe what you want the agent to do", - ) - add_infos = gr.Textbox( - label="Additional Information", - lines=3, - placeholder="Add any helpful context or instructions...", - info="Optional hints to help the LLM complete the task", - ) - - with gr.Row(): - run_button = gr.Button("▶️ Run Agent", variant="primary", scale=2) - stop_button = gr.Button("⏹️ Stop", variant="stop", scale=1) - - # with gr.Row(): - # browser_view = gr.HTML( - # value="

Waiting for browser session...

", - # label="Live Browser View", - # ) - - with gr.Row(): + with gr.Column(scale=3, min_width=720, elem_id="right-column"): gr.HTML( - """ + """ +
- """ - ) - - with gr.TabItem("🧐 Deep Research", id=5): - research_task_input = gr.Textbox(label="Research Task", lines=5, value="Compose a report on the use of Reinforcement Learning for training Large Language Models, encompassing its origins, current advancements, and future prospects, substantiated with examples of relevant models and techniques. The report should reflect original insights and analysis, moving beyond mere summarization of existing literature.") - with gr.Row(): - max_search_iteration_input = gr.Number(label="Max Search Iteration", value=3, precision=0) # precision=0 确保是整数 - max_query_per_iter_input = gr.Number(label="Max Query per Iteration", value=1, precision=0) # precision=0 确保是整数 - with gr.Row(): - research_button = gr.Button("▶️ Run Deep Research", variant="primary", scale=2) - stop_research_button = gr.Button("⏹️ Stop", variant="stop", scale=1) - markdown_output_display = gr.Markdown(label="Research Report") - markdown_download = gr.File(label="Download Research Report") +
+ """ + ) - with gr.TabItem("📊 Results", id=6): - with gr.Group(): + gr.HTML( + """ + + """ + ) - recording_display = gr.Video(label="Latest Recording") + with gr.Row(elem_id="settings-button-container"): + open_modal_button = gr.Button("⚙️", variant="secondary", elem_id="settings-button") - gr.Markdown("### Results") - with gr.Row(): - with gr.Column(): - final_result_output = gr.Textbox( - label="Final Result", lines=3, show_label=True + + # Modal Container (Initially Hidden) + with gr.Group(visible=False) as modal: + with gr.Blocks(css=".tab-container { min-width: 800px; }"): + with gr.Tabs() as tabs: + + with gr.TabItem("🌐 Browser Settings", id=1): + with gr.Group(): + with gr.Row(): + use_own_browser = gr.Checkbox( + label="Use Own Browser", + value=config['use_own_browser'], + info="Use your existing browser instance", + ) + keep_browser_open = gr.Checkbox( + label="Keep Browser Open", + value=config['keep_browser_open'], + info="Keep Browser Open between Tasks", + ) + headless = gr.Checkbox( + label="Headless Mode", + value=config['headless'], + info="Run browser without GUI", + ) + disable_security = gr.Checkbox( + label="Disable Security", + value=config['disable_security'], + info="Disable browser security features", + ) + enable_recording = gr.Checkbox( + label="Enable Recording", + value=config['enable_recording'], + info="Enable saving browser recordings", + ) + + with gr.Row(): + window_w = gr.Number( + label="Window Width", + value=config['window_w'], + info="Browser window width", + ) + window_h = gr.Number( + label="Window Height", + value=config['window_h'], + info="Browser window height", + ) + + save_recording_path = gr.Textbox( + label="Recording Path", + placeholder="e.g. ./tmp/record_videos", + value=config['save_recording_path'], + info="Path to save browser recordings", + interactive=True, # Allow editing only if recording is enabled + ) + + save_trace_path = gr.Textbox( + label="Trace Path", + placeholder="e.g. ./tmp/traces", + value=config['save_trace_path'], + info="Path to save Agent traces", + interactive=True, ) - with gr.Column(): - errors_output = gr.Textbox( - label="Errors", lines=3, show_label=True + + save_agent_history_path = gr.Textbox( + label="Agent History Save Path", + placeholder="e.g., ./tmp/agent_history", + value=config['save_agent_history_path'], + info="Specify the directory where agent history should be saved.", + interactive=True, ) - with gr.Row(): - with gr.Column(): - model_actions_output = gr.Textbox( - label="Model Actions", lines=3, show_label=True + + with gr.TabItem("⚙️ Agent Settings", id=2): + with gr.Group(): + agent_type = gr.Radio( + ["org", "custom"], + label="Agent Type", + value=config['agent_type'], + info="Select the type of agent to use", + ) + with gr.Column(): + max_steps = gr.Slider( + minimum=1, + maximum=200, + value=config['max_steps'], + step=1, + label="Max Run Steps", + info="Maximum number of steps the agent will take", + ) + max_actions_per_step = gr.Slider( + minimum=1, + maximum=20, + value=config['max_actions_per_step'], + step=1, + label="Max Actions per Step", + info="Maximum number of actions the agent will take per step", + ) + with gr.Column(): + use_vision = gr.Checkbox( + label="Use Vision", + value=config['use_vision'], + info="Enable visual processing capabilities", + ) + tool_calling_method = gr.Dropdown( + label="Tool Calling Method", + value=config['tool_calling_method'], + interactive=True, + allow_custom_value=True, # Allow users to input custom model names + choices=["auto", "json_schema", "function_calling"], + info="Tool Calls Funtion Name", + visible=False + ) + + with gr.TabItem("🔧 LLM Configuration", id=3): + with gr.Group(): + llm_provider = gr.Dropdown( + choices=[provider for provider,model in utils.model_names.items()], + label="LLM Provider", + value=config['llm_provider'], + info="Select your preferred language model provider" + ) + llm_model_name = gr.Dropdown( + label="Model Name", + choices=utils.model_names['openai'], + value=config['llm_model_name'], + interactive=True, + allow_custom_value=True, # Allow users to input custom model names + info="Select a model from the dropdown or type a custom model name" ) - with gr.Column(): - model_thoughts_output = gr.Textbox( - label="Model Thoughts", lines=3, show_label=True + llm_num_ctx = gr.Slider( + minimum=2**8, + maximum=2**16, + value=config['llm_num_ctx'], + step=1, + label="Max Context Length", + info="Controls max context length model needs to handle (less = faster)", + visible=config['llm_provider'] == "ollama" ) + llm_temperature = gr.Slider( + minimum=0.0, + maximum=2.0, + value=config['llm_temperature'], + step=0.1, + label="Temperature", + info="Controls randomness in model outputs" + ) + with gr.Row(): + llm_base_url = gr.Textbox( + label="Base URL", + value=config['llm_base_url'], + info="API endpoint URL (if required)" + ) + llm_api_key = gr.Textbox( + label="API Key", + type="password", + value=config['llm_api_key'], + info="Your API key (leave blank to use .env)" + ) + + # Change event to update context length slider + def update_llm_num_ctx_visibility(llm_provider): + return gr.update(visible=llm_provider == "ollama") + + # Bind the change event of llm_provider to update the visibility of context length slider + llm_provider.change( + fn=update_llm_num_ctx_visibility, + inputs=llm_provider, + outputs=llm_num_ctx + ) - trace_file = gr.File(label="Trace File") + + + # with gr.TabItem("🤖 Run Agent", id=4): + # task = gr.Textbox( + # label="Task Description", + # lines=4, + # placeholder="Enter your task here...", + # value=config['task'], + # info="Describe what you want the agent to do", + # ) + # add_infos = gr.Textbox( + # label="Additional Information", + # lines=3, + # placeholder="Add any helpful context or instructions...", + # info="Optional hints to help the LLM complete the task", + # ) + + # with gr.Row(): + # run_button = gr.Button("▶️ Run Agent", variant="primary", scale=2) + # stop_button = gr.Button("⏹️ Stop", variant="stop", scale=1) + + # # with gr.Row(): + # # browser_view = gr.HTML( + # # value="

Waiting for browser session...

", + # # label="Live Browser View", + # # ) + + # with gr.Row(): + # gr.HTML( + # """ + # + # """ + # ) + + with gr.TabItem("🧐 Deep Research", id=5): + research_task_input = gr.Textbox(label="Research Task", lines=5, value="Compose a report on the use of Reinforcement Learning for training Large Language Models, encompassing its origins, current advancements, and future prospects, substantiated with examples of relevant models and techniques. The report should reflect original insights and analysis, moving beyond mere summarization of existing literature.") + with gr.Row(): + max_search_iteration_input = gr.Number(label="Max Search Iteration", value=3, precision=0) # precision=0 确保是整数 + max_query_per_iter_input = gr.Number(label="Max Query per Iteration", value=1, precision=0) # precision=0 确保是整数 + with gr.Row(): + research_button = gr.Button("▶️ Run Deep Research", variant="primary", scale=2) + stop_research_button = gr.Button("⏹️ Stop", variant="stop", scale=1) + markdown_output_display = gr.Markdown(label="Research Report") + markdown_download = gr.File(label="Download Research Report") + + + with gr.TabItem("📊 Results", id=6): + with gr.Group(): + + recording_display = gr.Video(label="Latest Recording") + + gr.Markdown("### Results") + with gr.Row(): + with gr.Column(): + final_result_output = gr.Textbox( + label="Final Result", lines=3, show_label=True + ) + with gr.Column(): + errors_output = gr.Textbox( + label="Errors", lines=3, show_label=True + ) + with gr.Row(): + with gr.Column(): + model_actions_output = gr.Textbox( + label="Model Actions", lines=3, show_label=True + ) + with gr.Column(): + model_thoughts_output = gr.Textbox( + label="Model Thoughts", lines=3, show_label=True + ) + + trace_file = gr.File(label="Trace File") + + agent_history_file = gr.File(label="Agent History") + + # # Bind the stop button click event after errors_output is defined + # stop_button.click( + # fn=stop_agent, + # inputs=[], + # outputs=[errors_output, stop_button, run_button], + # ) + + # # Run button click handler + # run_button.click( + # fn=run_with_stream, + # inputs=[ + # agent_type, llm_provider, llm_model_name, llm_num_ctx, llm_temperature, llm_base_url, llm_api_key, + # use_own_browser, keep_browser_open, headless, disable_security, window_w, window_h, + # save_recording_path, save_agent_history_path, save_trace_path, # Include the new path + # enable_recording, task, add_infos, max_steps, use_vision, max_actions_per_step, tool_calling_method + # ], + # outputs=[ + # # browser_view, # Browser view + # final_result_output, # Final result + # errors_output, # Errors + # model_actions_output, # Model actions + # model_thoughts_output, # Model thoughts + # recording_display, # Latest recording + # trace_file, # Trace file + # agent_history_file, # Agent history file + # stop_button, # Stop button + # run_button # Run button + # ], + # ) + + # Run Deep Research + research_button.click( + fn=run_deep_search, + inputs=[research_task_input, max_search_iteration_input, max_query_per_iter_input, llm_provider, llm_model_name, llm_num_ctx, llm_temperature, llm_base_url, llm_api_key, use_vision, use_own_browser, headless], + outputs=[markdown_output_display, markdown_download, stop_research_button, research_button] + ) + # Bind the stop button click event after errors_output is defined + stop_research_button.click( + fn=stop_research_agent, + inputs=[], + outputs=[stop_research_button, research_button], + ) - agent_history_file = gr.File(label="Agent History") + with gr.TabItem("🎥 Recordings", id=7): + def list_recordings(save_recording_path): + if not os.path.exists(save_recording_path): + return [] - # Bind the stop button click event after errors_output is defined - stop_button.click( - fn=stop_agent, - inputs=[], - outputs=[errors_output, stop_button, run_button], - ) + # Get all video files + recordings = glob.glob(os.path.join(save_recording_path, "*.[mM][pP]4")) + glob.glob(os.path.join(save_recording_path, "*.[wW][eE][bB][mM]")) - # Run button click handler - run_button.click( - fn=run_with_stream, - inputs=[ - agent_type, llm_provider, llm_model_name, llm_num_ctx, llm_temperature, llm_base_url, llm_api_key, - use_own_browser, keep_browser_open, headless, disable_security, window_w, window_h, - save_recording_path, save_agent_history_path, save_trace_path, # Include the new path - enable_recording, task, add_infos, max_steps, use_vision, max_actions_per_step, tool_calling_method - ], - outputs=[ - # browser_view, # Browser view - final_result_output, # Final result - errors_output, # Errors - model_actions_output, # Model actions - model_thoughts_output, # Model thoughts - recording_display, # Latest recording - trace_file, # Trace file - agent_history_file, # Agent history file - stop_button, # Stop button - run_button # Run button - ], - ) - - # Run Deep Research - research_button.click( - fn=run_deep_search, - inputs=[research_task_input, max_search_iteration_input, max_query_per_iter_input, llm_provider, llm_model_name, llm_num_ctx, llm_temperature, llm_base_url, llm_api_key, use_vision, use_own_browser, headless], - outputs=[markdown_output_display, markdown_download, stop_research_button, research_button] - ) - # Bind the stop button click event after errors_output is defined - stop_research_button.click( - fn=stop_research_agent, - inputs=[], - outputs=[stop_research_button, research_button], - ) + # Sort recordings by creation time (oldest first) + recordings.sort(key=os.path.getctime) - with gr.TabItem("🎥 Recordings", id=7): - def list_recordings(save_recording_path): - if not os.path.exists(save_recording_path): - return [] + # Add numbering to the recordings + numbered_recordings = [] + for idx, recording in enumerate(recordings, start=1): + filename = os.path.basename(recording) + numbered_recordings.append((recording, f"{idx}. {filename}")) - # Get all video files - recordings = glob.glob(os.path.join(save_recording_path, "*.[mM][pP]4")) + glob.glob(os.path.join(save_recording_path, "*.[wW][eE][bB][mM]")) + return numbered_recordings - # Sort recordings by creation time (oldest first) - recordings.sort(key=os.path.getctime) + recordings_gallery = gr.Gallery( + label="Recordings", + value=list_recordings(config['save_recording_path']), + columns=3, + height="auto", + object_fit="contain" + ) - # Add numbering to the recordings - numbered_recordings = [] - for idx, recording in enumerate(recordings, start=1): - filename = os.path.basename(recording) - numbered_recordings.append((recording, f"{idx}. {filename}")) + refresh_button = gr.Button("🔄 Refresh Recordings", variant="secondary") + refresh_button.click( + fn=list_recordings, + inputs=save_recording_path, + outputs=recordings_gallery + ) + + with gr.TabItem("📁 Configuration", id=8): + with gr.Group(): + config_file_input = gr.File( + label="Load Config File", + file_types=[".pkl"], + interactive=True + ) - return numbered_recordings + load_config_button = gr.Button("Load Existing Config From File", variant="primary") + save_config_button = gr.Button("Save Current Config", variant="primary") - recordings_gallery = gr.Gallery( - label="Recordings", - value=list_recordings(config['save_recording_path']), - columns=3, - height="auto", - object_fit="contain" - ) + config_status = gr.Textbox( + label="Status", + lines=2, + interactive=False + ) - refresh_button = gr.Button("🔄 Refresh Recordings", variant="secondary") - refresh_button.click( - fn=list_recordings, - inputs=save_recording_path, - outputs=recordings_gallery - ) - - with gr.TabItem("📁 Configuration", id=8): - with gr.Group(): - config_file_input = gr.File( - label="Load Config File", - file_types=[".pkl"], - interactive=True - ) + load_config_button.click( + fn=update_ui_from_config, + inputs=[config_file_input], + outputs=[ + agent_type, max_steps, max_actions_per_step, use_vision, tool_calling_method, + llm_provider, llm_model_name, llm_num_ctx, llm_temperature, llm_base_url, llm_api_key, + use_own_browser, keep_browser_open, headless, disable_security, enable_recording, + window_w, window_h, save_recording_path, save_trace_path, save_agent_history_path, + config_status + ] + ) - load_config_button = gr.Button("Load Existing Config From File", variant="primary") - save_config_button = gr.Button("Save Current Config", variant="primary") + save_config_button.click( + fn=save_current_config, + inputs=[ + agent_type, max_steps, max_actions_per_step, use_vision, tool_calling_method, + llm_provider, llm_model_name, llm_num_ctx, llm_temperature, llm_base_url, llm_api_key, + use_own_browser, keep_browser_open, headless, disable_security, + enable_recording, window_w, window_h, save_recording_path, save_trace_path, + save_agent_history_path, + ], + outputs=[config_status] + ) - config_status = gr.Textbox( - label="Status", - lines=2, - interactive=False - ) - load_config_button.click( - fn=update_ui_from_config, - inputs=[config_file_input], - outputs=[ - agent_type, max_steps, max_actions_per_step, use_vision, tool_calling_method, - llm_provider, llm_model_name, llm_num_ctx, llm_temperature, llm_base_url, llm_api_key, - use_own_browser, keep_browser_open, headless, disable_security, enable_recording, - window_w, window_h, save_recording_path, save_trace_path, save_agent_history_path, - task, config_status - ] - ) + # Close Button + close_modal_button = gr.Button("❌ Close", variant="stop") - save_config_button.click( - fn=save_current_config, - inputs=[ - agent_type, max_steps, max_actions_per_step, use_vision, tool_calling_method, - llm_provider, llm_model_name, llm_num_ctx, llm_temperature, llm_base_url, llm_api_key, - use_own_browser, keep_browser_open, headless, disable_security, - enable_recording, window_w, window_h, save_recording_path, save_trace_path, - save_agent_history_path, task, - ], - outputs=[config_status] - ) + # Bind Events + open_modal_button.click(fn=lambda: gr.update(visible=True), inputs=[], outputs=modal) + + close_modal_button.click(fn=close_modal, inputs=[], outputs=modal) + + + + + + # Run button click handler + run_button.click( + fn=run_with_stream, + inputs=[ + agent_type, llm_provider, llm_model_name, llm_num_ctx, llm_temperature, llm_base_url, llm_api_key, + use_own_browser, keep_browser_open, headless, disable_security, window_w, window_h, + save_recording_path, save_agent_history_path, save_trace_path, # Include the new path + enable_recording, task, add_infos, max_steps, use_vision, max_actions_per_step, tool_calling_method + ], + outputs=[ + final_result_output, errors_output, model_actions_output, model_thoughts_output, + recording_display, trace_file, agent_history_file, stop_button, run_button + ], + ) + + stop_button.click( + fn=stop_agent, + inputs=[], + outputs=[errors_output, stop_button, run_button], + ) + + # with gr.Tabs() as tabs: + # with gr.TabItem("⚙️ Agent Settings", id=1): + # with gr.Group(): + # agent_type = gr.Radio( + # ["org", "custom"], + # label="Agent Type", + # value=config['agent_type'], + # info="Select the type of agent to use", + # ) + # with gr.Column(): + # max_steps = gr.Slider( + # minimum=1, + # maximum=200, + # value=config['max_steps'], + # step=1, + # label="Max Run Steps", + # info="Maximum number of steps the agent will take", + # ) + # max_actions_per_step = gr.Slider( + # minimum=1, + # maximum=20, + # value=config['max_actions_per_step'], + # step=1, + # label="Max Actions per Step", + # info="Maximum number of actions the agent will take per step", + # ) + # with gr.Column(): + # use_vision = gr.Checkbox( + # label="Use Vision", + # value=config['use_vision'], + # info="Enable visual processing capabilities", + # ) + # tool_calling_method = gr.Dropdown( + # label="Tool Calling Method", + # value=config['tool_calling_method'], + # interactive=True, + # allow_custom_value=True, # Allow users to input custom model names + # choices=["auto", "json_schema", "function_calling"], + # info="Tool Calls Funtion Name", + # visible=False + # ) + + # with gr.TabItem("🔧 LLM Configuration", id=2): + # with gr.Group(): + # llm_provider = gr.Dropdown( + # choices=[provider for provider,model in utils.model_names.items()], + # label="LLM Provider", + # value=config['llm_provider'], + # info="Select your preferred language model provider" + # ) + # llm_model_name = gr.Dropdown( + # label="Model Name", + # choices=utils.model_names['openai'], + # value=config['llm_model_name'], + # interactive=True, + # allow_custom_value=True, # Allow users to input custom model names + # info="Select a model from the dropdown or type a custom model name" + # ) + # llm_num_ctx = gr.Slider( + # minimum=2**8, + # maximum=2**16, + # value=config['llm_num_ctx'], + # step=1, + # label="Max Context Length", + # info="Controls max context length model needs to handle (less = faster)", + # visible=config['llm_provider'] == "ollama" + # ) + # llm_temperature = gr.Slider( + # minimum=0.0, + # maximum=2.0, + # value=config['llm_temperature'], + # step=0.1, + # label="Temperature", + # info="Controls randomness in model outputs" + # ) + # with gr.Row(): + # llm_base_url = gr.Textbox( + # label="Base URL", + # value=config['llm_base_url'], + # info="API endpoint URL (if required)" + # ) + # llm_api_key = gr.Textbox( + # label="API Key", + # type="password", + # value=config['llm_api_key'], + # info="Your API key (leave blank to use .env)" + # ) + + # # Change event to update context length slider + # def update_llm_num_ctx_visibility(llm_provider): + # return gr.update(visible=llm_provider == "ollama") + + # # Bind the change event of llm_provider to update the visibility of context length slider + # llm_provider.change( + # fn=update_llm_num_ctx_visibility, + # inputs=llm_provider, + # outputs=llm_num_ctx + # ) + + # with gr.TabItem("🌐 Browser Settings", id=3): + # with gr.Group(): + # with gr.Row(): + # use_own_browser = gr.Checkbox( + # label="Use Own Browser", + # value=config['use_own_browser'], + # info="Use your existing browser instance", + # ) + # keep_browser_open = gr.Checkbox( + # label="Keep Browser Open", + # value=config['keep_browser_open'], + # info="Keep Browser Open between Tasks", + # ) + # headless = gr.Checkbox( + # label="Headless Mode", + # value=config['headless'], + # info="Run browser without GUI", + # ) + # disable_security = gr.Checkbox( + # label="Disable Security", + # value=config['disable_security'], + # info="Disable browser security features", + # ) + # enable_recording = gr.Checkbox( + # label="Enable Recording", + # value=config['enable_recording'], + # info="Enable saving browser recordings", + # ) + + # with gr.Row(): + # window_w = gr.Number( + # label="Window Width", + # value=config['window_w'], + # info="Browser window width", + # ) + # window_h = gr.Number( + # label="Window Height", + # value=config['window_h'], + # info="Browser window height", + # ) + + # save_recording_path = gr.Textbox( + # label="Recording Path", + # placeholder="e.g. ./tmp/record_videos", + # value=config['save_recording_path'], + # info="Path to save browser recordings", + # interactive=True, # Allow editing only if recording is enabled + # ) + + # save_trace_path = gr.Textbox( + # label="Trace Path", + # placeholder="e.g. ./tmp/traces", + # value=config['save_trace_path'], + # info="Path to save Agent traces", + # interactive=True, + # ) + + # save_agent_history_path = gr.Textbox( + # label="Agent History Save Path", + # placeholder="e.g., ./tmp/agent_history", + # value=config['save_agent_history_path'], + # info="Specify the directory where agent history should be saved.", + # interactive=True, + # ) + + # with gr.TabItem("🤖 Run Agent", id=4): + # task = gr.Textbox( + # label="Task Description", + # lines=4, + # placeholder="Enter your task here...", + # value=config['task'], + # info="Describe what you want the agent to do", + # ) + # add_infos = gr.Textbox( + # label="Additional Information", + # lines=3, + # placeholder="Add any helpful context or instructions...", + # info="Optional hints to help the LLM complete the task", + # ) + + # with gr.Row(): + # run_button = gr.Button("▶️ Run Agent", variant="primary", scale=2) + # stop_button = gr.Button("⏹️ Stop", variant="stop", scale=1) + + # # with gr.Row(): + # # browser_view = gr.HTML( + # # value="

Waiting for browser session...

", + # # label="Live Browser View", + # # ) + + # with gr.Row(): + # gr.HTML( + # """ + # + # """ + # ) + + # with gr.TabItem("🧐 Deep Research", id=5): + # research_task_input = gr.Textbox(label="Research Task", lines=5, value="Compose a report on the use of Reinforcement Learning for training Large Language Models, encompassing its origins, current advancements, and future prospects, substantiated with examples of relevant models and techniques. The report should reflect original insights and analysis, moving beyond mere summarization of existing literature.") + # with gr.Row(): + # max_search_iteration_input = gr.Number(label="Max Search Iteration", value=3, precision=0) # precision=0 确保是整数 + # max_query_per_iter_input = gr.Number(label="Max Query per Iteration", value=1, precision=0) # precision=0 确保是整数 + # with gr.Row(): + # research_button = gr.Button("▶️ Run Deep Research", variant="primary", scale=2) + # stop_research_button = gr.Button("⏹️ Stop", variant="stop", scale=1) + # markdown_output_display = gr.Markdown(label="Research Report") + # markdown_download = gr.File(label="Download Research Report") + + + # with gr.TabItem("📊 Results", id=6): + # with gr.Group(): + + # recording_display = gr.Video(label="Latest Recording") + + # gr.Markdown("### Results") + # with gr.Row(): + # with gr.Column(): + # final_result_output = gr.Textbox( + # label="Final Result", lines=3, show_label=True + # ) + # with gr.Column(): + # errors_output = gr.Textbox( + # label="Errors", lines=3, show_label=True + # ) + # with gr.Row(): + # with gr.Column(): + # model_actions_output = gr.Textbox( + # label="Model Actions", lines=3, show_label=True + # ) + # with gr.Column(): + # model_thoughts_output = gr.Textbox( + # label="Model Thoughts", lines=3, show_label=True + # ) + + # trace_file = gr.File(label="Trace File") + + # agent_history_file = gr.File(label="Agent History") + + # # Bind the stop button click event after errors_output is defined + # stop_button.click( + # fn=stop_agent, + # inputs=[], + # outputs=[errors_output, stop_button, run_button], + # ) + + # # Run button click handler + # run_button.click( + # fn=run_with_stream, + # inputs=[ + # agent_type, llm_provider, llm_model_name, llm_num_ctx, llm_temperature, llm_base_url, llm_api_key, + # use_own_browser, keep_browser_open, headless, disable_security, window_w, window_h, + # save_recording_path, save_agent_history_path, save_trace_path, # Include the new path + # enable_recording, task, add_infos, max_steps, use_vision, max_actions_per_step, tool_calling_method + # ], + # outputs=[ + # # browser_view, # Browser view + # final_result_output, # Final result + # errors_output, # Errors + # model_actions_output, # Model actions + # model_thoughts_output, # Model thoughts + # recording_display, # Latest recording + # trace_file, # Trace file + # agent_history_file, # Agent history file + # stop_button, # Stop button + # run_button # Run button + # ], + # ) + + # # Run Deep Research + # research_button.click( + # fn=run_deep_search, + # inputs=[research_task_input, max_search_iteration_input, max_query_per_iter_input, llm_provider, llm_model_name, llm_num_ctx, llm_temperature, llm_base_url, llm_api_key, use_vision, use_own_browser, headless], + # outputs=[markdown_output_display, markdown_download, stop_research_button, research_button] + # ) + # # Bind the stop button click event after errors_output is defined + # stop_research_button.click( + # fn=stop_research_agent, + # inputs=[], + # outputs=[stop_research_button, research_button], + # ) + + # with gr.TabItem("🎥 Recordings", id=7): + # def list_recordings(save_recording_path): + # if not os.path.exists(save_recording_path): + # return [] + + # # Get all video files + # recordings = glob.glob(os.path.join(save_recording_path, "*.[mM][pP]4")) + glob.glob(os.path.join(save_recording_path, "*.[wW][eE][bB][mM]")) + + # # Sort recordings by creation time (oldest first) + # recordings.sort(key=os.path.getctime) + + # # Add numbering to the recordings + # numbered_recordings = [] + # for idx, recording in enumerate(recordings, start=1): + # filename = os.path.basename(recording) + # numbered_recordings.append((recording, f"{idx}. {filename}")) + + # return numbered_recordings + + # recordings_gallery = gr.Gallery( + # label="Recordings", + # value=list_recordings(config['save_recording_path']), + # columns=3, + # height="auto", + # object_fit="contain" + # ) + + # refresh_button = gr.Button("🔄 Refresh Recordings", variant="secondary") + # refresh_button.click( + # fn=list_recordings, + # inputs=save_recording_path, + # outputs=recordings_gallery + # ) + + # with gr.TabItem("📁 Configuration", id=8): + # with gr.Group(): + # config_file_input = gr.File( + # label="Load Config File", + # file_types=[".pkl"], + # interactive=True + # ) + + # load_config_button = gr.Button("Load Existing Config From File", variant="primary") + # save_config_button = gr.Button("Save Current Config", variant="primary") + + # config_status = gr.Textbox( + # label="Status", + # lines=2, + # interactive=False + # ) + + # load_config_button.click( + # fn=update_ui_from_config, + # inputs=[config_file_input], + # outputs=[ + # agent_type, max_steps, max_actions_per_step, use_vision, tool_calling_method, + # llm_provider, llm_model_name, llm_num_ctx, llm_temperature, llm_base_url, llm_api_key, + # use_own_browser, keep_browser_open, headless, disable_security, enable_recording, + # window_w, window_h, save_recording_path, save_trace_path, save_agent_history_path, + # task, config_status + # ] + # ) + + # save_config_button.click( + # fn=save_current_config, + # inputs=[ + # agent_type, max_steps, max_actions_per_step, use_vision, tool_calling_method, + # llm_provider, llm_model_name, llm_num_ctx, llm_temperature, llm_base_url, llm_api_key, + # use_own_browser, keep_browser_open, headless, disable_security, + # enable_recording, window_w, window_h, save_recording_path, save_trace_path, + # save_agent_history_path, task, + # ], + # outputs=[config_status] + # ) # Attach the callback to the LLM provider dropdown From 21404f011a0372ff650ea5c1172e405bd9d3eadb Mon Sep 17 00:00:00 2001 From: prathamxcaliber Date: Sun, 2 Mar 2025 22:21:58 +0530 Subject: [PATCH 7/9] feat: add custom_theme for the app --- custom_theme.py | 73 +++++++++++++++++++++++++++++++++++++++++++++++++ webui.py | 62 ++++++++++++++++++++++++----------------- 2 files changed, 110 insertions(+), 25 deletions(-) create mode 100644 custom_theme.py diff --git a/custom_theme.py b/custom_theme.py new file mode 100644 index 00000000..80c25394 --- /dev/null +++ b/custom_theme.py @@ -0,0 +1,73 @@ +from __future__ import annotations + +from collections.abc import Iterable + +from gradio.themes.base import Base +from gradio.themes.utils import colors, fonts, sizes + + +class custom_theme(Base): + def __init__( + self, + *, + primary_hue: colors.Color | str = colors.blue, + secondary_hue: colors.Color | str = colors.sky, + neutral_hue: colors.Color | str = colors.gray, + spacing_size: sizes.Size | str = sizes.spacing_md, + radius_size: sizes.Size | str = sizes.radius_lg, + text_size: sizes.Size | str = sizes.text_md, + font: fonts.Font | str | Iterable[fonts.Font | str] = ( + fonts.GoogleFont("Montserrat"), + "ui-sans-serif", + "system-ui", + "sans-serif", + ), + font_mono: fonts.Font | str | Iterable[fonts.Font | str] = ( + fonts.GoogleFont("Inter"), + "ui-monospace", + "Consolas", + "monospace", + ), + ): + super().__init__( + primary_hue=primary_hue, + secondary_hue=secondary_hue, + neutral_hue=neutral_hue, + spacing_size=spacing_size, + radius_size=radius_size, + text_size=text_size, + font=font, + font_mono=font_mono, + ) + self.name = "custom_theme" + super().set( + button_border_width="0px", + checkbox_label_border_width="1px", + button_transform_hover="scale(1.02)", + button_transition="all 0.1s ease-in-out", + slider_color="*primary_400", + button_primary_background_fill="linear-gradient(120deg, *secondary_500 0%, *primary_300 60%, *primary_400 100%)", + button_primary_background_fill_hover="linear-gradient(120deg, *secondary_400 0%, *primary_300 60%, *primary_300 100%)", + button_primary_text_color="*button_secondary_text_color", + button_secondary_background_fill="linear-gradient(120deg, *neutral_300 0%, *neutral_100 60%, *neutral_200 100%)", + button_secondary_background_fill_hover="linear-gradient(120deg, *neutral_200 0%, *neutral_100 60%, *neutral_100 100%)", + checkbox_label_background_fill_selected="linear-gradient(120deg, *primary_400 0%, *primary_300 60%, *primary_400 100%)", + checkbox_label_border_color_selected="*primary_400", + checkbox_background_color_selected="*primary_400", + checkbox_label_text_color_selected="*button_secondary_text_color", + slider_color_dark="*primary_500", + button_primary_background_fill_dark="linear-gradient(120deg, *secondary_600 0%, *primary_500 60%, *primary_600 100%)", + button_primary_background_fill_hover_dark="linear-gradient(120deg, *secondary_500 0%, *primary_500 60%, *primary_500 100%)", + button_primary_text_color_dark="*button_secondary_text_color", + button_secondary_background_fill_dark="linear-gradient(120deg, *neutral_700 0%, *neutral_600 60%, *neutral_700 100%)", + button_secondary_background_fill_hover_dark="linear-gradient(120deg, *neutral_600 0%, *neutral_600 60%, *neutral_700 100%)", + checkbox_label_background_fill_selected_dark="linear-gradient(120deg, *primary_600 0%, *primary_500 60%, *primary_600 100%)", + checkbox_label_border_color_selected_dark="*primary_600", + checkbox_background_color_selected_dark="*primary_600", + checkbox_label_text_color_selected_dark="*button_secondary_text_color", + block_shadow="*shadow_drop_lg", + button_secondary_shadow_hover="*shadow_drop_lg", + button_primary_shadow_hover="0 1px 3px 0 *primary_200, 0 1px 2px -1px *primary_200", + button_secondary_shadow_dark="none", + button_primary_shadow_dark="none", + ) diff --git a/webui.py b/webui.py index 2ad1c397..51385473 100644 --- a/webui.py +++ b/webui.py @@ -32,6 +32,7 @@ from src.browser.custom_context import BrowserContextConfig, CustomBrowserContext from src.controller.custom_controller import CustomController from gradio.themes import Citrus, Default, Glass, Monochrome, Ocean, Origin, Soft, Base +from custom_theme import custom_theme from src.utils.default_config_settings import default_config, load_config_from_file, save_config_to_file, save_current_config, update_ui_from_config from src.utils.utils import update_model_dropdown, get_latest_files, capture_screenshot @@ -74,6 +75,9 @@ def open_modal(): def close_modal(): return gr.update(visible=False) +def show_iframe(): + return gr.update(visible=True) + async def stop_agent(): """Request the agent to stop and update UI with enhanced feedback""" global _global_agent_state, _global_browser_context, _global_browser, _global_agent @@ -650,7 +654,8 @@ async def run_with_stream( "Origin": Origin(), "Citrus": Citrus(), "Ocean": Ocean(), - "Base": Base() + "Base": Base(), + "custom_theme": custom_theme() } async def close_global_browser(): @@ -690,7 +695,7 @@ async def run_deep_search(research_task, max_search_iteration_input, max_query_p return markdown_content, file_path, gr.update(value="Stop", interactive=True), gr.update(interactive=True) -def create_ui(config, theme_name="Ocean"): +def create_ui(config, theme_name="custom_theme"): css = """ .gradio-container { max-width: 1200px !important; @@ -711,22 +716,25 @@ def create_ui(config, theme_name="Ocean"): with gr.Blocks( title="EHR Operator", theme=theme_map[theme_name], css="body { display: flex; justify-content: center; } #main-container { max-width: 1200px; width: 100%; }" ) as demo: + with gr.Row(): gr.Markdown( """

🌐 EHR Operator

-

Control your browser with AI assistance

+

Control your EHR via prompts

""", elem_classes=["header-text"], ) - with gr.Blocks(elem_id="main-container"): - with gr.Row(equal_height=True): - with gr.Column(scale=2, min_width=480, elem_id="left-column"): + + with gr.Blocks(elem_id="main-container"): + # Main Row (Contains left-column and right-column iframe) + with gr.Row(equal_height=True): + with gr.Column(scale=2, min_width=480, elem_id="left-column"): with gr.Group(): task = gr.Textbox( label="Task Description", - lines=10, + lines=10, placeholder="Enter your task here...", value=config['task'], info="Describe what you want the agent to do", @@ -740,25 +748,29 @@ def create_ui(config, theme_name="Ocean"): ) with gr.Row(): - run_button = gr.Button("▶️ Run Agent", variant="primary", scale=1) - stop_button = gr.Button("⏹️ Stop", variant="stop", scale=1) + run_button = gr.Button("Run Agent", variant="primary", scale=1) + stop_button = gr.Button("Stop", variant="stop", scale=1) - with gr.Column(scale=3, min_width=720, elem_id="right-column"): + # Initially hidden iframe column inside the SAME Row + with gr.Column(scale=3, min_width=720, elem_id="right-column", visible=False) as iframe_row: gr.HTML( - """ -
- -
- """ - ) + """ +
+ +
+ """ + ) + + # Button click will now reveal the iframe in the SAME row + run_button.click(show_iframe, outputs=iframe_row) gr.HTML( @@ -1574,7 +1586,7 @@ def main(): parser = argparse.ArgumentParser(description="Gradio UI for Browser Agent") parser.add_argument("--ip", type=str, default="127.0.0.1", help="IP address to bind to") parser.add_argument("--port", type=int, default=7788, help="Port to listen on") - parser.add_argument("--theme", type=str, default="Ocean", choices=theme_map.keys(), help="Theme to use for the UI") + parser.add_argument("--theme", type=str, default="custom_theme", choices=theme_map.keys(), help="Theme to use for the UI") parser.add_argument("--dark-mode", action="store_true", help="Enable dark mode") args = parser.parse_args() From cae287f09ee6c5b35f4ff093b79026eeb1f009fe Mon Sep 17 00:00:00 2001 From: prathamxcaliber Date: Sun, 2 Mar 2025 22:57:05 +0530 Subject: [PATCH 8/9] feat: add XC logo to favicon --- logo.png | Bin 0 -> 2559 bytes webui.py | 8 ++++++-- 2 files changed, 6 insertions(+), 2 deletions(-) create mode 100644 logo.png diff --git a/logo.png b/logo.png new file mode 100644 index 0000000000000000000000000000000000000000..99ebb6fcdda35efa01aeb1606a8dd8004e725f4f GIT binary patch literal 2559 zcmV|wqP))dlQxzom)Oe&-kX>2-U#imxRNfERlzQiA}A_&P#{R1C_wiJA=g3pq-zKDoL zv?!9Qtp$ZlD~JP1lNNuHjCRDSW9>{rlFm$K?%jKJ@AbF$nLBOhA$RUQ=j@-g*ZS64 z-&#RP*JJ;`_HYdG5hA!90w6^KAS8(C4>bfKrtdT`q{js5IbiyvoAfWFKarpF92!OR zxp@qaLm8LJIe83GC4?Tv1Pax?u1|bOB&-jn^G#tij2~&zugAeok~Tro!Mu6a zOuonbO5Ub>OiB|WWt?hRHm`-0-?oOHG7Dw%FwcN-A*s@QQc434x9R3kWg0o5Y9KvF znWPh{g=+Jwu>6eF>}D0>gB6o!p^{1Uu;6RfP|!mOCL&wvP{W#0{J1Gi!zv`QP$(Ol z)+%kQImzZ3b~8jU0(Q-r)+1hv=TPfLN)RJ*q|6u{suPwu6^7U zb|TRv)-ld8mVCk%quN7R1N@xVK&mXT%@R4WNyT#uZ-ZCmxj1{EsDUg#Br49x%iqKI%D(4f7qh& zT+hz!F*M838ejBp;CzpgW2uUPgg;A|eC)$Xu(p_W-)@8xclVR2n;p%~0{(SKYnw60 zeo?WE!HCYeeV&8=V#ky+mbOnyDhVT#d8q%;FudmO-59+W~NXIneX} z8NrRN)ir`=UfvQip0yLG!=f-yp^zRLWlSb5O0qF~e~ju(+nKfIY>SO(gxQGla|@#s z^yAwKI(K1c0`LLZkJV{4#~T*&+QoY8R8$gB8lD}gA$g&uR(4c`0ylldbT$(HfAE*B zw-xl#8j+=0FKbL0ft+cXGHPyXwOI_}{V}jcvBbfwb|Jia>b!}43SrLgm#z@~GC9dx z3A($dn`?Zv(nS>_zk|n5i=ljYYr%!5OrS~?B~ZNtUb+`^}WekHGyGbXA?bQg*?rx9+5enMwRfq7=h``JO$;yIOm`x6s*^kB={`N^NR zU}Fcx-ha3!VN5qRQm)m5o0|Mn6eeS{6-#N;7zx%GR_K+53^f&3rRmw%KPox=iz#+a zO#rUGLGa}KI9><45WaMDS7G~IhpkEx`3OZ6r-E2Wjuv;&C}VMtj9%VqIt#@l$7foc zzNHaC`UofQ+r?IRX=wy!&X0$z`|oVw(a#KU>ZZo(E>^;1AMmrs`tZ%q52|!otj87< zM-nK4uqVy{SY@Ev1Of$wnW-KfpY3;|`b^60$8K+7{$Gi%o!f$YrW=^Lx#-pNo8YS{?r6M^}9_GD3p2(vO%pn4!O9i18=C*?ZRThCE&>q z^~X}rY9hcFUz3!$kr>da=dQ#0*3N+-3wLkgr?b;N7cyU;PEeXX*hu+8)MfjUt&}}c zsUVG0N}MYt^T~S?E+?C@&7YhfQWBPod95avb?VvcOg7%h_0`V}z*|a6?Xu|=tLjWz zRZ2M1KeKR(;y%>Lfk;>?1UM_zy#d02Wq!uNlDozA^-<20ME%te2n4y>68r5L2b z;PP$A7^Nx?i$>Av_31r;$KT)YEWC6%!kK>z;ZOg}DYD_iIoT#@gk8Eq_(am|%z*|z z{((NsBy;b7SJxuv|2={ilLP+o#SyJd>qpvRcC-7iLhA zhvX1`S}W*0#Ca@6kC|M-rtd+&p+R3Nh}SPi?cS*tg)nnH6OGlEo^kY$r7lD#sA~X05HQ@j*)S%WHQ&Txn>O^}MmsLiF|vT3Rbj z0xSop(wQ|SG$~-TciwChZI#Ykhtd@S;-zsXz?+xhTeB%gpWT{JzB2+PRA=uOjQ>_75_$RnVtc&cp)l=qj!q`nQEn?Mp6~VNL zAZtUXo$h;(@mwt2H_AjZE+7_$7L6&@Z5Ds#(Ey+lge+eZSlvw+J568JR^=R21nQW# zSiqc6IfnRx{cdBHIe@oj6dQFjiwCOKq0$PLI$Rk$S(B{BVud{?KU8=7w1|@fqB@lw z#$;%X6#-SvRwX2*mZ`#*h7xl=O>xNH9BUzq`LwjaI*3cZs)`bVO*G9*sFK5FB{gZ& zxpHFglBClH?>GXv7rPS#*D4lxlz0eL2rPKC(x)g2yuH4yjtjKd0QSDq;iT#$%ojVY za1$7HGc1^M%~P>VX$XVWKCCG%EzIoN88N2jP{?%?iWL^)u=y_2n1`NNJ_Nnq