Draft
Changes from all commits
51 commits
4f5b696
feat: one-click deployment with docker
tianxing02 Jul 21, 2025
7c6d58c
Merge remote-tracking branch 'origin/dev' into dev
tianxing02 Jul 21, 2025
faeea51
feat: one-click deployment with docker
tianxing02 Jul 21, 2025
8364606
Merge remote-tracking branch 'origin/dev' into dev
tianxing02 Jul 21, 2025
6942a7e
Merge remote-tracking branch 'origin/dev' into dev
tianxing02 Jul 21, 2025
a1158c9
feat: one-click deployment with docker
tianxing02 Jul 21, 2025
3964580
feat: one-click deployment with docker
tianxing02 Jul 21, 2025
b01decb
Merge branch 'MemTensor:dev' into dev
tianxing02 Jul 21, 2025
879a0e8
feat: one-click deployment with docker
tianxing02 Jul 21, 2025
e878ebf
Merge remote-tracking branch 'origin/dev' into dev
tianxing02 Jul 21, 2025
4a968f7
feat: one-click deployment with docker
tianxing02 Jul 21, 2025
4fd3573
feat: one-click deployment with docker
tianxing02 Jul 21, 2025
872312f
feat: one-click deployment with docker
tianxing02 Jul 21, 2025
b8e9bc4
Merge branch 'MemTensor:dev' into dev
tianxing02 Jul 22, 2025
b89d091
feat: one-click deployment with docker
tianxing02 Jul 22, 2025
2aa5db8
feat: one-click deployment with docker
tianxing02 Jul 22, 2025
19bdbfb
feat: docker settings modify
tianxing02 Jul 22, 2025
c3bf76f
Merge branch 'dev' into dev
CaralHsi Jul 23, 2025
1558921
Update tree_config.json
tianxing02 Jul 23, 2025
ed20468
Update simple_openapi_memos.py
tianxing02 Jul 23, 2025
b13d633
feat: docker settings modify
tianxing02 Jul 23, 2025
d109a80
Merge remote-tracking branch 'origin/dev' into dev
tianxing02 Jul 23, 2025
586f72d
Merge remote-tracking branch 'origin/dev' into dev
tianxing02 Jul 28, 2025
b8be22c
Merge remote-tracking branch 'origin/dev' into dev
tianxing02 Jul 29, 2025
8b53fe8
Merge remote-tracking branch 'origin/dev' into dev
tianxing02 Jul 29, 2025
94e996d
Merge remote-tracking branch 'origin/dev' into dev
tianxing02 Jul 31, 2025
117606f
Merge remote-tracking branch 'origin/dev' into dev
tianxing02 Aug 1, 2025
275a0f0
Merge remote-tracking branch 'origin/dev' into dev
tianxing02 Aug 6, 2025
cc14bc7
Merge remote-tracking branch 'origin/dev' into dev
tianxing02 Aug 7, 2025
70c475a
feat:add hotpotQA evaluation
tianxing02 Aug 12, 2025
ea29337
Merge remote-tracking branch 'origin/dev' into dev
tianxing02 Aug 13, 2025
7e23cab
feat:add MMLongbench-doc evaluation
tianxing02 Aug 15, 2025
03329c4
feat:add MMLongbench-doc evaluation
tianxing02 Aug 19, 2025
1331c69
feat:add MMLongbench-doc evaluation
tianxing02 Aug 19, 2025
2e6e692
feat:add MMLongbench-doc evaluation
tianxing02 Aug 21, 2025
90464d6
feat:add MMLongbench-doc evaluation
tianxing02 Aug 28, 2025
c398fea
feat:add MMLongbench-doc evaluation
tianxing02 Aug 29, 2025
485b3e8
feat:add MMLongbench-doc evaluation
tianxing02 Aug 29, 2025
313bbc1
feat:update 10.23
tianxing02 Oct 23, 2025
b589acd
feat:update 11.12
tianxing02 Nov 12, 2025
b0500f1
feat:update 11.12
tianxing02 Nov 12, 2025
43f52a5
feat:update 11.12
tianxing02 Nov 12, 2025
2608346
feat:update 11.17
tianxing02 Nov 17, 2025
5d2e333
feat:update 11.17
tianxing02 Nov 17, 2025
4f62e35
feat:update 11.25
tianxing02 Nov 25, 2025
dd34693
feat: add MMLongbench-Doc, HotpotQA, xinyu evaluation
tianxing02 Nov 25, 2025
d9d2f8c
feat:update 11.27
tianxing02 Nov 27, 2025
8207cc9
feat:update 11.27
tianxing02 Nov 27, 2025
74654a6
feat:update 11.27
tianxing02 Nov 27, 2025
9cae814
feat:update 11.27
tianxing02 Nov 27, 2025
530a2ca
feat:update 11.27
tianxing02 Nov 27, 2025
File renamed without changes.
224 changes: 224 additions & 0 deletions evaluation/scripts/hotpot/hotpot_eval.py
@@ -0,0 +1,224 @@
import json
import os
import uuid

from concurrent.futures import ThreadPoolExecutor, as_completed

from dotenv import load_dotenv
from tqdm import tqdm

from memos.configs.mem_cube import GeneralMemCubeConfig
from memos.configs.mem_os import MOSConfig
from memos.mem_cube.general import GeneralMemCube
from memos.mem_os.main import MOS


load_dotenv()

db_name = "stx-hotpot-001"


user_name = str(uuid.uuid4())

# 1.1 Set openai config
openapi_config = {
    "model_name_or_path": "gpt-4o-mini",
    "temperature": 0.8,
    "max_tokens": 1024,
    "top_p": 0.9,
    "top_k": 50,
    "remove_think_prefix": True,
    "api_key": os.getenv("OPENAI_API_KEY", "sk-xxxxx"),
    "api_base": os.getenv("OPENAI_API_BASE", "https://api.openai.com/v1"),
}
# 1.2 Set neo4j config
neo4j_uri = os.getenv("NEO4J_URI", "bolt://localhost:7687")

# 1.3 Create MOS Config
config = {
    "user_id": user_name,
    "chat_model": {
        "backend": "openai",
        "config": openapi_config,
    },
    "mem_reader": {
        "backend": "simple_struct",
        "config": {
            "llm": {
                "backend": "openai",
                "config": openapi_config,
            },
            "embedder": {
                "backend": "universal_api",
                "config": {
                    "provider": "openai",
                    "api_key": os.getenv("OPENAI_API_KEY", "sk-xxxxx"),
                    "model_name_or_path": "text-embedding-3-large",
                    "base_url": os.getenv("OPENAI_API_BASE", "https://api.openai.com/v1"),
                },
            },
            "chunker": {
                "backend": "sentence",
                "config": {
                    "tokenizer_or_token_counter": "gpt2",
                    "chunk_size": 512,
                    "chunk_overlap": 128,
                    "min_sentences_per_chunk": 1,
                },
            },
        },
    },
    "max_turns_window": 20,
    "top_k": 5,
    "enable_textual_memory": True,
    "enable_activation_memory": False,
    "enable_parametric_memory": False,
}

mos_config = MOSConfig(**config)
# You can set PRO_MODE to True to enable CoT enhancement:
# mos_config.PRO_MODE = True
mos = MOS(mos_config)


# Filter out embedding fields, keeping only the necessary fields
def filter_memory_data(memories_data):
    filtered_data = {}
    for key, value in memories_data.items():
        if key == "text_mem":
            filtered_data[key] = []
            for mem_group in value:
                # Check if it's the new data structure (list of TextualMemoryItem objects)
                if "memories" in mem_group and isinstance(mem_group["memories"], list):
                    # New data structure: directly a list of TextualMemoryItem objects
                    filtered_memories = []
                    for memory_item in mem_group["memories"]:
                        # Create filtered dictionary
                        filtered_item = {
                            "id": memory_item.id,
                            "memory": memory_item.memory,
                            "metadata": {},
                        }
                        # Filter metadata, excluding embedding
                        if hasattr(memory_item, "metadata") and memory_item.metadata:
                            for attr_name in dir(memory_item.metadata):
                                if not attr_name.startswith("_") and attr_name != "embedding":
                                    attr_value = getattr(memory_item.metadata, attr_name)
                                    if not callable(attr_value):
                                        filtered_item["metadata"][attr_name] = attr_value
                        filtered_memories.append(filtered_item)

                    filtered_group = {
                        "cube_id": mem_group.get("cube_id", ""),
                        "memories": filtered_memories,
                    }
                    filtered_data[key].append(filtered_group)
                else:
                    # Old data structure: dictionary with nodes and edges
                    filtered_group = {
                        "memories": {"nodes": [], "edges": mem_group["memories"].get("edges", [])}
                    }
                    for node in mem_group["memories"].get("nodes", []):
                        filtered_node = {
                            "id": node.get("id"),
                            "memory": node.get("memory"),
                            "metadata": {
                                k: v
                                for k, v in node.get("metadata", {}).items()
                                if k != "embedding"
                            },
                        }
                        filtered_group["memories"]["nodes"].append(filtered_node)
                    filtered_data[key].append(filtered_group)
        else:
            filtered_data[key] = value
    return filtered_data


config = GeneralMemCubeConfig.model_validate(
    {
        "user_id": user_name,
        "cube_id": f"{user_name}",
        "text_mem": {
            "backend": "tree_text",
            "config": {
                "extractor_llm": {
                    "backend": "openai",
                    "config": openapi_config,
                },
                "dispatcher_llm": {
                    "backend": "openai",
                    "config": openapi_config,
                },
                "graph_db": {
                    "backend": "neo4j",
                    "config": {
                        "uri": neo4j_uri,
                        "user": "neo4j",
                        "password": "iaarlichunyu",
                        "db_name": db_name,
                        "auto_create": True,
                    },
                },
                "embedder": {
                    "backend": "universal_api",
                    "config": {
                        "provider": "openai",
                        "api_key": os.getenv("OPENAI_API_KEY", "sk-xxxxx"),
                        "model_name_or_path": "text-embedding-3-large",
                        "base_url": os.getenv("OPENAI_API_BASE", "https://api.openai.com/v1"),
                    },
                },
                "reorganize": True,
            },
        },
        "act_mem": {},
        "para_mem": {},
    },
)

mem_cube = GeneralMemCube(config)


mos.register_mem_cube(f"/tmp/{user_name}", mem_cube_id=user_name)


with open("evaluation/data/hotpot/hotpot_dev_distractor_v1.json") as f:
    data = json.load(f)


def build_context_text(context_list):
    parts = []
    for title, sentences in context_list:
        text = " ".join(s.strip() for s in sentences if s.strip())
        parts.append(f"{title}: {text}")
    return "\n".join(parts)


def build_and_ask(item):
    qid = item["_id"]
    question = item["question"]

    for title, sentences in item["context"]:
        text = " ".join(s.strip() for s in sentences if s.strip())
        memory_content = f"{title}: {text}"
        mos.add(memory_content=memory_content)

    answer = mos.chat(question).strip()
    return qid, answer


pred_answers = {}

with ThreadPoolExecutor(max_workers=5) as executor:
    futures = {executor.submit(build_and_ask, item): item for item in data}
    for future in tqdm(as_completed(futures), total=len(futures)):
        try:
            qid, answer = future.result()
            pred_answers[qid] = answer
        except Exception as e:
            print(f"Error: {e}")

predictions = {"answer": pred_answers, "sp": []}

with open("evaluation/data/hotpot/output/dev_distractor_pred.json", "w") as f:
    json.dump(predictions, f, ensure_ascii=False, indent=2)
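The predictions file written above matches the format the official HotpotQA evaluator expects: an "answer" map keyed by question id plus an "sp" entry for supporting facts. Since "sp" is left empty here, only the answer metrics (em/f1/prec/recall) will be meaningful when scoring. A minimal sketch of feeding this output to the evaluator defined in hotpot_evaluate_v1.py below; the paths are the ones used in this script, and having evaluation/scripts/hotpot importable is an assumption:

# Sketch: score the predictions with the evaluator from hotpot_evaluate_v1.py.
# Assumes evaluation/scripts/hotpot is on sys.path (e.g., run from that directory).
from hotpot_evaluate_v1 import eval as hotpot_eval

hotpot_eval(
    "evaluation/data/hotpot/output/dev_distractor_pred.json",  # predictions written above
    "evaluation/data/hotpot/hotpot_dev_distractor_v1.json",  # gold dev set
)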
151 changes: 151 additions & 0 deletions evaluation/scripts/hotpot/hotpot_evaluate_v1.py
@@ -0,0 +1,151 @@
import re
import string
import sys

from collections import Counter

import ujson as json


def normalize_answer(s):
    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_score(prediction, ground_truth):
    normalized_prediction = normalize_answer(prediction)
    normalized_ground_truth = normalize_answer(ground_truth)

    zero_metric = (0, 0, 0)

    if (
        normalized_prediction in ["yes", "no", "noanswer"]
        and normalized_prediction != normalized_ground_truth
    ):
        return zero_metric
    if (
        normalized_ground_truth in ["yes", "no", "noanswer"]
        and normalized_prediction != normalized_ground_truth
    ):
        return zero_metric

    prediction_tokens = normalized_prediction.split()
    ground_truth_tokens = normalized_ground_truth.split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return zero_metric
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1, precision, recall


def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)


def update_answer(metrics, prediction, gold):
    em = exact_match_score(prediction, gold)
    f1, prec, recall = f1_score(prediction, gold)
    metrics["em"] += float(em)
    metrics["f1"] += f1
    metrics["prec"] += prec
    metrics["recall"] += recall
    return em, prec, recall


def update_sp(metrics, prediction, gold):
    cur_sp_pred = set(map(tuple, prediction))
    gold_sp_pred = set(map(tuple, gold))
    tp, fp, fn = 0, 0, 0
    for e in cur_sp_pred:
        if e in gold_sp_pred:
            tp += 1
        else:
            fp += 1
    for e in gold_sp_pred:
        if e not in cur_sp_pred:
            fn += 1
    prec = 1.0 * tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = 1.0 * tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = 2 * prec * recall / (prec + recall) if prec + recall > 0 else 0.0
    em = 1.0 if fp + fn == 0 else 0.0
    metrics["sp_em"] += em
    metrics["sp_f1"] += f1
    metrics["sp_prec"] += prec
    metrics["sp_recall"] += recall
    return em, prec, recall


def eval(prediction_file, gold_file):
    with open(prediction_file) as f:
        prediction = json.load(f)
    with open(gold_file) as f:
        gold = json.load(f)

    metrics = {
        "em": 0,
        "f1": 0,
        "prec": 0,
        "recall": 0,
        "sp_em": 0,
        "sp_f1": 0,
        "sp_prec": 0,
        "sp_recall": 0,
        "joint_em": 0,
        "joint_f1": 0,
        "joint_prec": 0,
        "joint_recall": 0,
    }
    for dp in gold:
        cur_id = dp["_id"]
        can_eval_joint = True
        if cur_id not in prediction["answer"]:
            print(f"missing answer {cur_id}")
            can_eval_joint = False
        else:
            em, prec, recall = update_answer(metrics, prediction["answer"][cur_id], dp["answer"])
        if cur_id not in prediction["sp"]:
            print(f"missing sp fact {cur_id}")
            can_eval_joint = False
        else:
            sp_em, sp_prec, sp_recall = update_sp(
                metrics, prediction["sp"][cur_id], dp["supporting_facts"]
            )

        if can_eval_joint:
            joint_prec = prec * sp_prec
            joint_recall = recall * sp_recall
            if joint_prec + joint_recall > 0:
                joint_f1 = 2 * joint_prec * joint_recall / (joint_prec + joint_recall)
            else:
                joint_f1 = 0.0
            joint_em = em * sp_em

            metrics["joint_em"] += joint_em
            metrics["joint_f1"] += joint_f1
            metrics["joint_prec"] += joint_prec
            metrics["joint_recall"] += joint_recall

    n = len(gold)
    for k in metrics:
        metrics[k] /= n

    print(metrics)


if __name__ == "__main__":
    eval(sys.argv[1], sys.argv[2])
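The evaluator is invoked from the command line as python hotpot_evaluate_v1.py <prediction_file> <gold_file>. A quick, hand-checked sanity test of the scoring helpers, as a sketch rather than part of the script:

# Sanity check of the scoring helpers; expected values worked out by hand.
from hotpot_evaluate_v1 import exact_match_score, f1_score

# Normalization lowercases, strips punctuation, and drops articles.
assert exact_match_score("The Eiffel Tower!", "eiffel tower")

# One of two predicted tokens overlaps the gold answer:
# precision = 0.5, recall = 1.0, F1 = 2/3.
f1, prec, recall = f1_score("Paris France", "Paris")
assert round(f1, 3) == 0.667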
20 changes: 20 additions & 0 deletions evaluation/scripts/longbenchV2/import_data.py
@@ -0,0 +1,20 @@
from datasets import load_dataset


dataset = load_dataset("zai-org/LongBench-v2", split="train")
print(dataset)


def truncate(value, max_len=200):
    if isinstance(value, str) and len(value) > max_len:
        return value[:max_len] + "... [TRUNCATED]"
    return value


for i in range(10):
    sample = dataset[i]
    print(f"========== Sample {i} ==========")
    for key, value in sample.items():
        print(f"{key}: {truncate(value)}")

    print("\n")
Empty file.