From a4dd5f87c0c74cda79554d6a551e750fca0114df Mon Sep 17 00:00:00 2001
From: Rohit Rajan
Date: Fri, 24 Apr 2026 13:36:07 +0530
Subject: [PATCH 1/3] feat: add support for document extraction

---
 client.py | 38 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/client.py b/client.py
index 3d371cb..2daa8d9 100644
--- a/client.py
+++ b/client.py
@@ -1,7 +1,8 @@
 import time
+import os
 import httpx
 from datetime import datetime, timezone
-from typing import Optional
+from typing import Optional, Union
 
 from .types import Config, MaxunError
 
@@ -161,6 +162,41 @@ async def extract_with_llm(self, options: dict):
             self.client.post("/extract/llm", json=options, timeout=300)
         )
 
+    async def create_document_robot(
+        self,
+        file: Union[str, bytes],
+        prompt: str,
+        robot_name: Optional[str] = None,
+        ollama_model: Optional[str] = None,
+        file_name: Optional[str] = None,
+    ) -> dict:
+        """Create a document-extraction robot from a PDF file path or bytes."""
+        if isinstance(file, str):
+            file_name = file_name or os.path.basename(file)
+            with open(file, 'rb') as f:
+                file_bytes = f.read()
+        else:
+            file_bytes = file
+            file_name = file_name or 'document.pdf'
+
+        data = {'prompt': prompt}
+        if robot_name:
+            data['robotName'] = robot_name
+        if ollama_model:
+            data['ollamaModel'] = ollama_model
+
+        response = await self.client.post(
+            '/robots/document',
+            files={'file': (file_name, file_bytes, 'application/pdf')},
+            data=data,
+            timeout=120,
+        )
+        response.raise_for_status()
+        body = response.json()
+        if not body.get('data') and not body.get('robot'):
+            raise MaxunError('Failed to create document robot')
+        return body
+
     async def create_crawl_robot(self, url: str, options: dict):
         return await self._handle(
             self.client.post("/crawl", json={"url": url, **options})

From bdafdcd87e637389e65b7921317e3fbdf1ec8b89 Mon Sep 17 00:00:00 2001
From: Rohit Rajan
Date: Thu, 7 May 2026 17:22:56 +0530
Subject: [PATCH 2/3] feat: add document parsing support

---
 client.py | 44 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/client.py b/client.py
index 2daa8d9..8124e55 100644
--- a/client.py
+++ b/client.py
@@ -12,7 +12,8 @@ def __init__(self, config: Config):
 
         headers = {
             "x-api-key": self.api_key,
-            "Content-Type": "application/json",
+            # Content-Type is intentionally omitted here so httpx can set it
+            # correctly per request (e.g. multipart/form-data for file uploads)
         }
 
         if config.team_id:
@@ -197,6 +198,47 @@ async def create_document_robot(
             raise MaxunError('Failed to create document robot')
         return body
 
+    async def create_document_parse_robot(
+        self,
+        file: Union[str, bytes],
+        output_formats: list,
+        robot_name: Optional[str] = None,
+        file_name: Optional[str] = None,
+    ) -> dict:
+        """Create a document-parse robot from a PDF file path or bytes."""
+        if isinstance(file, str):
+            file_name = file_name or os.path.basename(file)
+            with open(file, 'rb') as f:
+                file_bytes = f.read()
+        else:
+            file_bytes = file
+            file_name = file_name or 'document.pdf'
+
+        valid_formats = {'markdown', 'html', 'links'}
+        filtered = [f for f in output_formats if f in valid_formats]
+        if not filtered:
+            raise MaxunError('At least one valid output format is required (markdown, html, links)')
+
+        data = {}
+        if robot_name:
+            data['robotName'] = robot_name
+
+        files_payload = [('file', (file_name, file_bytes, 'application/pdf'))]
+        for fmt in filtered:
+            files_payload.append(('outputFormats[]', (None, fmt)))
+
+        response = await self.client.post(
+            '/robots/document-parse',
+            files=files_payload,
+            data=data,
+            timeout=120,
+        )
+        response.raise_for_status()
+        body = response.json()
+        if not body.get('data') and not body.get('robot'):
+            raise MaxunError('Failed to create document-parse robot')
+        return body
+
     async def create_crawl_robot(self, url: str, options: dict):
         return await self._handle(
             self.client.post("/crawl", json={"url": url, **options})

From 30df5b4252b5ee787cd675268588c9e948c3a6bc Mon Sep 17 00:00:00 2001
From: Rohit Rajan
Date: Fri, 8 May 2026 12:19:19 +0530
Subject: [PATCH 3/3] chore: rename doc methods

---
 client.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/client.py b/client.py
index 8124e55..9b686ac 100644
--- a/client.py
+++ b/client.py
@@ -163,7 +163,7 @@ async def extract_with_llm(self, options: dict):
             self.client.post("/extract/llm", json=options, timeout=300)
         )
 
-    async def create_document_robot(
+    async def create_document_extract_robot(
         self,
         file: Union[str, bytes],
         prompt: str,