feat/complete multi modal (#570)

CaralHsi · web-flow · commit 0c0a402e78d6 · 2025-12-02T11:48:28.000+08:00
* fix: multi-modal memreader init error

* fix: kwargs bug

* feat: init examples for each multi-modal parser

* feat: simple user_parser

* feat: add multi-modal-parser example

* feat: add multi-modal-parser example

* feat: update user parser: only handle ChatCompletionUserMessageParam messages

* feat: rewrite create source and parse fast for system parser

* feat: rewrite create source and parse fast for system parser

* feat: rewrite assistant parser

* feat: add additional sources to assistant parser

* feat: add concat fast-mode memories from multi parsers

* refactor: fix name

* refactor: fix name

* refactor: fix name

* refactor: fix name

* refactor: fix name

* refactor: fix name

* feat: add fine process path-A in multi_modal_struct

* feat: add fine process path-A in multi_modal_struct

* feat: add simple & multimodal comparison example

* feat: add _process_transfer_multi_modal_data in multimodal

* feat: add image type

* feat: add tool role; update string/text/tool parser

* feat: update file_content_parser and multimodal reader

* feat: default mem-reader for api is not set to multimodal reader

* feat: add examples

* feat: temporary fix for server router bug
diff --git a/examples/api/server_router_api.py b/examples/api/server_router_api.py
@@ -181,6 +181,91 @@ def example_03_assistant_with_tool_calls():
 
 # ===========================================================================
 # 4. MultiModel messages
+def example_03b_tool_message_with_result():
+    """
+    Tool message returning the result of a tool call.
+
+    - `role = tool`, `content` contains the tool execution result.
+    - `tool_call_id` links this message to the original tool call.
+    - This is the standard format for tool execution results in OpenAI-style conversations.
+    """
+    payload = {
+        "user_id": USER_ID,
+        "writable_cube_ids": [MEM_CUBE_ID],
+        "messages": [
+            {
+                "role": "assistant",
+                "content": None,
+                "tool_calls": [
+                    {
+                        "id": "tool-call-weather-1",
+                        "type": "function",
+                        "function": {
+                            "name": "get_weather",
+                            "arguments": '{"location": "北京"}',
+                        },
+                    }
+                ],
+                "chat_time": "2025-11-24T10:12:00Z",
+                "message_id": "assistant-with-call-1",
+            },
+            {
+                "role": "tool",
+                "content": "北京今天天气晴朗，温度25°C，湿度60%。",
+                "tool_call_id": "tool-call-weather-1",
+                "chat_time": "2025-11-24T10:12:05Z",
+                "message_id": "tool-result-1",
+            },
+        ],
+        "info": {"source_type": "tool_execution"},
+    }
+    call_add_api("03b_tool_message_with_result", payload)
+
+
+def example_03c_tool_description_input_output():
+    """
+    Custom tool message format: tool_description, tool_input, tool_output.
+
+    - This demonstrates the custom tool message format (not OpenAI standard).
+    - `tool_description`: describes the tool/function definition.
+    - `tool_input`: the input parameters for the tool call.
+    - `tool_output`: the result/output from the tool execution.
+    - These are alternative formats for representing tool interactions.
+    """
+    payload = {
+        "user_id": USER_ID,
+        "writable_cube_ids": [MEM_CUBE_ID],
+        "messages": [
+            {
+                "type": "tool_description",
+                "name": "get_weather",
+                "description": "获取指定地点的当前天气信息",
+                "parameters": {
+                    "type": "object",
+                    "properties": {"location": {"type": "string", "description": "城市名称"}},
+                    "required": ["location"],
+                },
+            },
+            {
+                "type": "tool_input",
+                "call_id": "call_123",
+                "name": "get_weather",
+                "argument": {"location": "北京"},
+            },
+            {
+                "type": "tool_output",
+                "call_id": "call_123",
+                "name": "get_weather",
+                "output": {"weather": "晴朗", "temperature": 25, "humidity": 60},
+            },
+        ],
+        "info": {"source_type": "custom_tool_format"},
+    }
+    call_add_api("03c_tool_description_input_output", payload)
+
+
+# ===========================================================================
+# 4. Multimodal messages
 # ===========================================================================
 
 
@@ -414,6 +499,56 @@ def example_09b_pure_file_input_by_file_data():
     call_add_api("09b_pure_file_input_by_file_data", payload)
 
 
+def example_09c_pure_file_input_by_oss_url():
+    """
+    Pure file input item using file_data with OSS URL.
+
+    - Uses `file_data` with OSS URL (object storage service URL).
+    - This format is used when files are stored in cloud storage (e.g., Alibaba Cloud OSS).
+    - The file_data field accepts both base64-encoded content and OSS URLs.
+    """
+    payload = {
+        "user_id": USER_ID,
+        "writable_cube_ids": [MEM_CUBE_ID],
+        "messages": [
+            {
+                "type": "file",
+                "file": {
+                    "file_data": "oss_url",  # OSS URL instead of base64
+                    "filename": "document.pdf",
+                },
+            }
+        ],
+        "info": {"source_type": "file_ingestion_oss"},
+    }
+    call_add_api("09c_pure_file_input_by_oss_url", payload)
+
+
+def example_09d_pure_image_input():
+    """
+    Pure image input item without dialog context.
+
+    - This demonstrates adding an image as a standalone input item (not part of a conversation).
+    - Uses the same format as pure text/file inputs, but with image_url type.
+    - Useful for batch image ingestion or when images don't have associated dialog.
+    """
+    payload = {
+        "user_id": USER_ID,
+        "writable_cube_ids": [MEM_CUBE_ID],
+        "messages": [
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": "https://example.com/standalone_image.jpg",
+                    "detail": "high",
+                },
+            }
+        ],
+        "info": {"source_type": "image_ingestion"},
+    }
+    call_add_api("09d_pure_image_input", payload)
+
+
 def example_10_mixed_text_file_image():
     """
     Mixed multimodal message: text + file + image in a single user message.
@@ -619,6 +754,96 @@ def example_16_feedback_add():
     call_add_api("16_feedback_add", payload)
 
 
+def example_17_family_travel_conversation():
+    """
+    Multi-turn conversation example: family travel planning.
+
+    - Demonstrates a complete conversation with multiple user-assistant exchanges.
+    - Shows how to add a full conversation history in a single request.
+    - Uses async_mode for asynchronous processing.
+    - This example shows a Chinese conversation about summer travel planning for families.
+    """
+    payload = {
+        "user_id": "memos_automated_testing",
+        "writable_cube_ids": [MEM_CUBE_ID],
+        "session_id": "0610",
+        "async_mode": "async",
+        "messages": [
+            {
+                "role": "user",
+                "content": "我想暑假出去玩，你能帮我推荐下吗？",
+            },
+            {
+                "role": "assistant",
+                "content": "好的！是自己出行还是和家人朋友一起呢？",
+            },
+            {
+                "role": "user",
+                "content": "肯定要带孩子啊，我们家出门都是全家一起。",
+            },
+            {
+                "role": "assistant",
+                "content": "明白了，所以你们是父母带孩子一块儿旅行，对吗？",
+            },
+            {
+                "role": "user",
+                "content": "对，带上孩子和老人，一般都是全家行动。",
+            },
+            {
+                "role": "assistant",
+                "content": "收到，那我会帮你推荐适合家庭出游的目的地。",
+            },
+        ],
+        "custom_tags": [],
+        "info": {
+            "source_type": "chat",
+            "conversation_id": "0610",
+        },
+    }
+    call_add_api("17_family_travel_conversation", payload)
+
+
+def example_18_add_with_chat_history():
+    """
+    Add memory with chat_history field.
+
+    - `chat_history` provides additional conversation context separate from `messages`.
+    - This is useful when you want to add specific messages while providing broader context.
+    - The chat_history helps the system understand the conversation flow better.
+    """
+    payload = {
+        "user_id": USER_ID,
+        "writable_cube_ids": [MEM_CUBE_ID],
+        "session_id": "session_with_history",
+        "messages": [
+            {
+                "role": "user",
+                "content": "我想了解一下这个产品的价格。",
+            },
+            {
+                "role": "assistant",
+                "content": "好的，我来为您查询价格信息。",
+            },
+        ],
+        "chat_history": [
+            {
+                "role": "system",
+                "content": "You are a helpful product assistant.",
+            },
+            {
+                "role": "user",
+                "content": "你好，我想咨询产品信息。",
+            },
+            {
+                "role": "assistant",
+                "content": "您好！我很乐意为您提供产品信息。",
+            },
+        ],
+        "info": {"source_type": "chat_with_history"},
+    }
+    call_add_api("18_add_with_chat_history", payload)
+
+
 # ===========================================================================
 # Entry point
 # ===========================================================================
@@ -628,17 +853,23 @@ def example_16_feedback_add():
     example_01_string_message_minimal()
     example_02_standard_chat_triplet()
     example_03_assistant_with_tool_calls()
+    example_03b_tool_message_with_result()
+    example_03c_tool_description_input_output()
     example_04_extreme_multimodal_single_message()
     example_05_multimodal_text_and_image()
     example_06_multimodal_text_and_file()
     example_07_audio_only_message()
     example_08_pure_text_input_items()
     example_09_pure_file_input_by_file_id()
     example_09b_pure_file_input_by_file_data()
+    example_09c_pure_file_input_by_oss_url()
+    example_09d_pure_image_input()
     example_10_mixed_text_file_image()
     example_11_deprecated_memory_content_and_doc_path()
     example_12_async_default_pipeline()
     example_13_sync_fast_pipeline()
     example_14_sync_fine_pipeline()
     example_15_async_with_task_id()
     example_16_feedback_add()
+    example_17_family_travel_conversation()
+    example_18_add_with_chat_history()
diff --git a/examples/mem_reader/multimodal_struct_reader.py b/examples/mem_reader/multimodal_struct_reader.py
@@ -164,6 +164,38 @@ def get_info(self) -> dict[str, Any]:
             ]
         ],
     ),
+    TestCase(
+        name="chat_with_list_content",
+        description="",
+        scene_data=[
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "我是测试base64",
+                        },
+                        {
+                            "type": "file",
+                            "file": {
+                                "file_data": "Hello World",
+                                "filename": "2102b64c-25a2-481c-a940-4325496baf39.txt",
+                                "file_id": "90ee1bcf-5295-4b75-91a4-23fe1f7ab30a",
+                            },
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": "https://play-groud-test-1.oss-cn-shanghai.aliyuncs.com/algorithmImages/2025/12/01/ce545319ba6d4d21a0aebcb75337acc3.jpeg"
+                            },
+                        },
+                    ],
+                    "message_id": "1995458892790317057",
+                }
+            ]
+        ],
+    ),
 ]
 
 # Tool-related test cases
diff --git a/src/memos/api/product_models.py b/src/memos/api/product_models.py
@@ -6,7 +6,7 @@
 
 # Import message types from core types module
 from memos.log import get_logger
-from memos.types import MessageList, MessagesType, PermissionDict, SearchMode
+from memos.types import PermissionDict, SearchMode
 
 
 logger = get_logger(__name__)
@@ -56,7 +56,7 @@ class Message(BaseModel):
 
 class MemoryCreate(BaseRequest):
     user_id: str = Field(..., description="User ID")
-    messages: list[Message] | None = Field(None, description="List of messages to store.")
+    messages: list | None = Field(None, description="List of messages to store.")
     memory_content: str | None = Field(None, description="Content to store as memory")
     doc_path: str | None = Field(None, description="Path to document to store")
     mem_cube_id: str | None = Field(None, description="ID of the memory cube")
@@ -83,7 +83,7 @@ class ChatRequest(BaseRequest):
     writable_cube_ids: list[str] | None = Field(
         None, description="List of cube IDs user can write for multi-cube chat"
     )
-    history: MessageList | None = Field(None, description="Chat history")
+    history: list | None = Field(None, description="Chat history")
     mode: SearchMode = Field(SearchMode.FAST, description="search mode: fast, fine, or mixture")
     system_prompt: str | None = Field(None, description="Base system prompt to use for chat")
     top_k: int = Field(10, description="Number of results to return")
@@ -165,7 +165,7 @@ class ChatCompleteRequest(BaseRequest):
     user_id: str = Field(..., description="User ID")
     query: str = Field(..., description="Chat query message")
     mem_cube_id: str | None = Field(None, description="Cube ID to use for chat")
-    history: MessageList | None = Field(None, description="Chat history")
+    history: list | None = Field(None, description="Chat history")
     internet_search: bool = Field(False, description="Whether to use internet search")
     system_prompt: str | None = Field(None, description="Base prompt to use for chat")
     top_k: int = Field(10, description="Number of results to return")
@@ -251,7 +251,7 @@ class MemoryCreateRequest(BaseRequest):
     """Request model for creating memories."""
 
     user_id: str = Field(..., description="User ID")
-    messages: MessagesType | None = Field(None, description="List of messages to store.")
+    messages: str | list | None = Field(None, description="List of messages to store.")
     memory_content: str | None = Field(None, description="Memory content to store")
     doc_path: str | None = Field(None, description="Path to document to store")
     mem_cube_id: str | None = Field(None, description="Cube ID")
@@ -360,7 +360,7 @@ class APISearchRequest(BaseRequest):
     )
 
     # ==== Context ====
-    chat_history: MessageList | None = Field(
+    chat_history: list | None = Field(
         None,
         description=(
             "Historical chat messages used internally by algorithms. "
@@ -490,7 +490,7 @@ class APIADDRequest(BaseRequest):
     )
 
     # ==== Input content ====
-    messages: MessagesType | None = Field(
+    messages: str | list | None = Field(
         None,
         description=(
             "List of messages to store. Supports: "
@@ -506,7 +506,7 @@ class APIADDRequest(BaseRequest):
     )
 
     # ==== Chat history ====
-    chat_history: MessageList | None = Field(
+    chat_history: list | None = Field(
         None,
         description=(
             "Historical chat messages used internally by algorithms. "
@@ -639,7 +639,7 @@ class APIChatCompleteRequest(BaseRequest):
     writable_cube_ids: list[str] | None = Field(
         None, description="List of cube IDs user can write for multi-cube chat"
     )
-    history: MessageList | None = Field(None, description="Chat history")
+    history: list | None = Field(None, description="Chat history")
     mode: SearchMode = Field(SearchMode.FAST, description="search mode: fast, fine, or mixture")
     system_prompt: str | None = Field(None, description="Base system prompt to use for chat")
     top_k: int = Field(10, description="Number of results to return")
@@ -707,7 +707,7 @@ class SuggestionRequest(BaseRequest):
     user_id: str = Field(..., description="User ID")
     mem_cube_id: str = Field(..., description="Cube ID")
     language: Literal["zh", "en"] = Field("zh", description="Language for suggestions")
-    message: MessagesType | None = Field(None, description="List of messages to store.")
+    message: list | None = Field(None, description="List of messages to store.")
 
 
 # ─── MemOS Client Response Models ──────────────────────────────────────────────