Visual Understanding
Give LLMs the ability to see and interact with desktop applications. Build autonomous agents that can complete complex tasks across any software.
import {
  screen, mouse, Button, keyboard, straightTo, FileType
} from "@nut-tree/nut-js";
import Anthropic from "@anthropic-ai/sdk";

async function runDesktopAgent(task: string) {
  const anthropic = new Anthropic({
    apiKey: process.env['ANTHROPIC_API_KEY'],
  });

  while (true) {
    // Capture what the agent "sees" and encode it for the vision model
    const screenshot = await screen.grab();
    const dataUrl = await screenshot.toDataURL(FileType.PNG);
    // Strip any data-URL prefix so only raw base64 remains for the API
    const base64Image = dataUrl.replace(/^data:image\/png;base64,/, "");

    // Ask the LLM what to do next
    const response = await anthropic.messages.create({
      model: "claude-sonnet-4-5-20250929",
      max_tokens: 1024,
      messages: [{
        role: "user",
        content: [
          {
            type: "image",
            source: { type: "base64", media_type: "image/png", data: base64Image }
          },
          { type: "text", text: `Task: ${task}\nWhat action should I take?` }
        ]
      }],
    });

    // Execute the proposed action (parsing the reply into a concrete action
    // and deciding when the task is done are application-specific)
    await executeAction(response.content);
    if (isTaskComplete(response)) break;
  }
}

nut.js provides everything you need to enable AI agents to interact with desktop applications.
Capture screenshots for LLM vision models. Let AI see and understand the desktop.
Find elements by UI structure, images, text, or colors. Flexible strategies for any app (see the sketch after this list).
Extract text from screen for enhanced context. Help AI understand UI elements.
Build agents that work on Windows, macOS, and Linux without code changes.
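As a minimal sketch of the image-based finding strategy: locate a reference screenshot on screen and click its center. The template file name (save-button.png) is a hypothetical placeholder, and screen.find with an image requires an image-finder provider (for example a template-matcher plugin) to be registered:

import {
  screen, mouse, straightTo, centerOf, imageResource
} from "@nut-tree/nut-js";

async function clickByImage(templatePath: string) {
  // Find the screen Region that best matches the template image
  const match = await screen.find(imageResource(templatePath));

  // Move to the center of the match and click it
  await mouse.move(straightTo(centerOf(match)));
  await mouse.leftClick();
}

// "save-button.png" is a hypothetical reference screenshot of the target element
clickByImage("save-button.png").catch(console.error);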
Connect your LLM's output to precise desktop interactions. nut.js handles the execution while your AI handles the reasoning.
Coordinate-based clicking: the LLM outputs coordinates, nut.js executes the clicks.
Natural text input: type text with configurable speed and timing.
Complex interactions: drag and drop, right-click menus, keyboard shortcuts (sketched below).
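As an illustration of the last two rows, here is a hedged sketch of a drag-and-drop, a keyboard shortcut, and a typing-speed setting; the coordinates, the Ctrl+S combination, and the 50 ms delay are placeholder assumptions rather than values from any particular app:

import {
  mouse, keyboard, straightTo, Point, Key
} from "@nut-tree/nut-js";

// Configurable typing speed: delay between keystrokes (50 ms is an arbitrary example)
keyboard.config.autoDelayMs = 50;

// Drag from a source point to a target point (coordinates are placeholders)
async function dragAndDrop(from: Point, to: Point) {
  await mouse.move(straightTo(from));
  // drag() presses the left button, follows the path, and releases it
  await mouse.drag(straightTo(to));
}

// Press a keyboard shortcut such as Ctrl+S (placeholder example)
async function saveShortcut() {
  await keyboard.pressKey(Key.LeftControl, Key.S);
  await keyboard.releaseKey(Key.LeftControl, Key.S);
}

The executor below handles the simpler, structured actions (click, type, scroll, wait) that the agent loop receives from the model.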
import {
  mouse, Button, keyboard, straightTo, sleep
} from "@nut-tree/nut-js";

// The structured action format the agent asks the LLM to produce
interface AgentAction {
  type: "click" | "type" | "scroll" | "wait";
  x?: number;
  y?: number;
  text?: string;
  direction?: "up" | "down";
  amount?: number;
}

async function executeAction(action: AgentAction) {
  switch (action.type) {
    case "click":
      // Move in a straight line to the coordinates chosen by the model, then click
      await mouse.move(
        straightTo({ x: action.x!, y: action.y! })
      );
      await mouse.click(Button.LEFT);
      break;
    case "type":
      // Type the text the model wants to enter
      await keyboard.type(action.text!);
      break;
    case "scroll":
      if (action.direction === "up") {
        await mouse.scrollUp(action.amount ?? 3);
      } else {
        await mouse.scrollDown(action.amount ?? 3);
      }
      break;
    case "wait":
      // Give the UI time to settle before the next screenshot
      await sleep(1000);
      break;
  }
}

import { screen, getActiveWindow } from "@nut-tree/nut-js";
async function getScreenContext() {
  // Capture the current screen contents
  const screenshot = await screen.grab();

  // Get information about the focused window
  const activeWindow = await getActiveWindow();

  return {
    screenshot: screenshot.data.toString("base64"),
    activeApp: await activeWindow.getTitle(),
    windowRegion: await activeWindow.getRegion(),
    screenSize: {
      width: await screen.width(),
      height: await screen.height()
    },
  };
}

// Provide rich context to your LLM
async function buildPrompt(task: string) {
  const context = await getScreenContext();
  return `
Current screen shows: ${context.activeApp}
Screen size: ${context.screenSize.width}x${context.screenSize.height}
Task: ${task}
`;
}

Combine screenshots with OCR text extraction, window information, and screen metadata to help your LLM make better decisions.
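As a hedged sketch of putting that window metadata to work, the helper below reuses getScreenContext from above and translates window-relative coordinates (hypothetically parsed from the model's reply) into absolute screen coordinates before clicking:

import { mouse, Button, straightTo, Point } from "@nut-tree/nut-js";

// relativeX/relativeY are hypothetical window-relative coordinates
// extracted from the model's reply.
async function clickInActiveWindow(relativeX: number, relativeY: number) {
  const context = await getScreenContext();

  // windowRegion.left/top give the active window's position on the screen
  const target = new Point(
    context.windowRegion.left + relativeX,
    context.windowRegion.top + relativeY
  );

  await mouse.move(straightTo(target));
  await mouse.click(Button.LEFT);
}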
nut.js is flexible enough to support various agent architectures and AI models.
Use GPT-4 Vision, Claude, or Gemini to interpret screenshots and decide actions.
Combine LLM reasoning with traditional image recognition for reliable automation, as sketched below.
Build agents that can plan, execute, and verify multi-step tasks independently.
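A minimal sketch of that hybrid, verify-after-acting pattern, reusing executeAction from above: the LLM chooses the action, nut.js executes it, and classic template matching confirms the expected UI state. The success-indicator.png template and the 5-second timeout are assumptions, and an image-finder plugin must be registered for screen.waitFor to match images:

import { screen, imageResource } from "@nut-tree/nut-js";

// Execute an LLM-chosen action, then confirm the UI reached the expected
// state by waiting for a known template to appear on screen.
async function executeAndVerify(action: AgentAction, expectedTemplate: string) {
  await executeAction(action);

  try {
    // Wait up to 5 seconds for the template to show up
    await screen.waitFor(imageResource(expectedTemplate), 5000);
    return true;
  } catch {
    // The template never appeared; report failure so the agent can re-plan
    return false;
  }
}

// "success-indicator.png" is a hypothetical screenshot of the expected UI state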
Build AI assistants that can operate any desktop application on behalf of users.
Create intelligent RPA that adapts to UI changes and handles exceptions.
Build AI-powered testers that explore applications and find issues autonomously.
Extract data from any application by teaching AI what to look for.
Get started with nut.js and give your AI the power to use any desktop application.