Draft
Changes from all commits
51 commits
4f5b696
feat: one-click deployment with docker
tianxing02 Jul 21, 2025
7c6d58c
Merge remote-tracking branch 'origin/dev' into dev
tianxing02 Jul 21, 2025
faeea51
feat: one-click deployment with docker
tianxing02 Jul 21, 2025
8364606
Merge remote-tracking branch 'origin/dev' into dev
tianxing02 Jul 21, 2025
6942a7e
Merge remote-tracking branch 'origin/dev' into dev
tianxing02 Jul 21, 2025
a1158c9
feat: one-click deployment with docker
tianxing02 Jul 21, 2025
3964580
feat: one-click deployment with docker
tianxing02 Jul 21, 2025
b01decb
Merge branch 'MemTensor:dev' into dev
tianxing02 Jul 21, 2025
879a0e8
feat: one-click deployment with docker
tianxing02 Jul 21, 2025
e878ebf
Merge remote-tracking branch 'origin/dev' into dev
tianxing02 Jul 21, 2025
4a968f7
feat: one-click deployment with docker
tianxing02 Jul 21, 2025
4fd3573
feat: one-click deployment with docker
tianxing02 Jul 21, 2025
872312f
feat: one-click deployment with docker
tianxing02 Jul 21, 2025
b8e9bc4
Merge branch 'MemTensor:dev' into dev
tianxing02 Jul 22, 2025
b89d091
feat: one-click deployment with docker
tianxing02 Jul 22, 2025
2aa5db8
feat: one-click deployment with docker
tianxing02 Jul 22, 2025
19bdbfb
feat: docker settings modify
tianxing02 Jul 22, 2025
c3bf76f
Merge branch 'dev' into dev
CaralHsi Jul 23, 2025
1558921
Update tree_config.json
tianxing02 Jul 23, 2025
ed20468
Update simple_openapi_memos.py
tianxing02 Jul 23, 2025
b13d633
feat: docker settings modify
tianxing02 Jul 23, 2025
d109a80
Merge remote-tracking branch 'origin/dev' into dev
tianxing02 Jul 23, 2025
586f72d
Merge remote-tracking branch 'origin/dev' into dev
tianxing02 Jul 28, 2025
b8be22c
Merge remote-tracking branch 'origin/dev' into dev
tianxing02 Jul 29, 2025
8b53fe8
Merge remote-tracking branch 'origin/dev' into dev
tianxing02 Jul 29, 2025
94e996d
Merge remote-tracking branch 'origin/dev' into dev
tianxing02 Jul 31, 2025
117606f
Merge remote-tracking branch 'origin/dev' into dev
tianxing02 Aug 1, 2025
275a0f0
Merge remote-tracking branch 'origin/dev' into dev
tianxing02 Aug 6, 2025
cc14bc7
Merge remote-tracking branch 'origin/dev' into dev
tianxing02 Aug 7, 2025
70c475a
feat:add hotpotQA evaluation
tianxing02 Aug 12, 2025
ea29337
Merge remote-tracking branch 'origin/dev' into dev
tianxing02 Aug 13, 2025
7e23cab
feat:add MMLongbench-doc evaluation
tianxing02 Aug 15, 2025
03329c4
feat:add MMLongbench-doc evaluation
tianxing02 Aug 19, 2025
1331c69
feat:add MMLongbench-doc evaluation
tianxing02 Aug 19, 2025
2e6e692
feat:add MMLongbench-doc evaluation
tianxing02 Aug 21, 2025
90464d6
feat:add MMLongbench-doc evaluation
tianxing02 Aug 28, 2025
c398fea
feat:add MMLongbench-doc evaluation
tianxing02 Aug 29, 2025
485b3e8
feat:add MMLongbench-doc evaluation
tianxing02 Aug 29, 2025
313bbc1
feat:update 10.23
tianxing02 Oct 23, 2025
b589acd
feat:update 11.12
tianxing02 Nov 12, 2025
b0500f1
feat:update 11.12
tianxing02 Nov 12, 2025
43f52a5
feat:update 11.12
tianxing02 Nov 12, 2025
2608346
feat:update 11.17
tianxing02 Nov 17, 2025
5d2e333
feat:update 11.17
tianxing02 Nov 17, 2025
4f62e35
feat:update 11.25
tianxing02 Nov 25, 2025
dd34693
feat: add MMLongbench-Doc, HotpotQA, xinyu evaluation
tianxing02 Nov 25, 2025
d9d2f8c
feat:update 11.27
tianxing02 Nov 27, 2025
8207cc9
feat:update 11.27
tianxing02 Nov 27, 2025
74654a6
feat:update 11.27
tianxing02 Nov 27, 2025
9cae814
feat:update 11.27
tianxing02 Nov 27, 2025
530a2ca
feat:update 11.27
tianxing02 Nov 27, 2025
File renamed without changes.
224 changes: 224 additions & 0 deletions evaluation/scripts/hotpot/hotpot_eval.py
@@ -0,0 +1,224 @@
import json
import os
import uuid

from concurrent.futures import ThreadPoolExecutor, as_completed

from dotenv import load_dotenv
from tqdm import tqdm

from memos.configs.mem_cube import GeneralMemCubeConfig
from memos.configs.mem_os import MOSConfig
from memos.mem_cube.general import GeneralMemCube
from memos.mem_os.main import MOS


load_dotenv()

db_name = "stx-hotpot-001"


user_name = str(uuid.uuid4())

# 1.1 Set openai config
openapi_config = {
    "model_name_or_path": "gpt-4o-mini",
    "temperature": 0.8,
    "max_tokens": 1024,
    "top_p": 0.9,
    "top_k": 50,
    "remove_think_prefix": True,
    "api_key": os.getenv("OPENAI_API_KEY", "sk-xxxxx"),
    "api_base": os.getenv("OPENAI_API_BASE", "https://api.openai.com/v1"),
}
# 1.2 Set neo4j config
neo4j_uri = os.getenv("NEO4J_URI", "bolt://localhost:7687")

# 1.3 Create MOS Config
config = {
    "user_id": user_name,
    "chat_model": {
        "backend": "openai",
        "config": openapi_config,
    },
    "mem_reader": {
        "backend": "simple_struct",
        "config": {
            "llm": {
                "backend": "openai",
                "config": openapi_config,
            },
            "embedder": {
                "backend": "universal_api",
                "config": {
                    "provider": "openai",
                    "api_key": os.getenv("OPENAI_API_KEY", "sk-xxxxx"),
                    "model_name_or_path": "text-embedding-3-large",
                    "base_url": os.getenv("OPENAI_API_BASE", "https://api.openai.com/v1"),
                },
            },
            "chunker": {
                "backend": "sentence",
                "config": {
                    "tokenizer_or_token_counter": "gpt2",
                    "chunk_size": 512,
                    "chunk_overlap": 128,
                    "min_sentences_per_chunk": 1,
                },
            },
        },
    },
    "max_turns_window": 20,
    "top_k": 5,
    "enable_textual_memory": True,
    "enable_activation_memory": False,
    "enable_parametric_memory": False,
}

mos_config = MOSConfig(**config)
# You can set PRO_MODE to True to enable CoT enhancement:
# mos_config.PRO_MODE = True
mos = MOS(mos_config)


# Filter out embedding fields, keeping only the necessary fields
def filter_memory_data(memories_data):
    filtered_data = {}
    for key, value in memories_data.items():
        if key == "text_mem":
            filtered_data[key] = []
            for mem_group in value:
                # Check if it's the new data structure (list of TextualMemoryItem objects)
                if "memories" in mem_group and isinstance(mem_group["memories"], list):
                    # New data structure: directly a list of TextualMemoryItem objects
                    filtered_memories = []
                    for memory_item in mem_group["memories"]:
                        # Create filtered dictionary
                        filtered_item = {
                            "id": memory_item.id,
                            "memory": memory_item.memory,
                            "metadata": {},
                        }
                        # Filter metadata, excluding embedding
                        if hasattr(memory_item, "metadata") and memory_item.metadata:
                            for attr_name in dir(memory_item.metadata):
                                if not attr_name.startswith("_") and attr_name != "embedding":
                                    attr_value = getattr(memory_item.metadata, attr_name)
                                    if not callable(attr_value):
                                        filtered_item["metadata"][attr_name] = attr_value
                        filtered_memories.append(filtered_item)

                    filtered_group = {
                        "cube_id": mem_group.get("cube_id", ""),
                        "memories": filtered_memories,
                    }
                    filtered_data[key].append(filtered_group)
                else:
                    # Old data structure: dictionary with nodes and edges
                    filtered_group = {
                        "memories": {"nodes": [], "edges": mem_group["memories"].get("edges", [])}
                    }
                    for node in mem_group["memories"].get("nodes", []):
                        filtered_node = {
                            "id": node.get("id"),
                            "memory": node.get("memory"),
                            "metadata": {
                                k: v
                                for k, v in node.get("metadata", {}).items()
                                if k != "embedding"
                            },
                        }
                        filtered_group["memories"]["nodes"].append(filtered_node)
                    filtered_data[key].append(filtered_group)
        else:
            filtered_data[key] = value
    return filtered_data


config = GeneralMemCubeConfig.model_validate(
    {
        "user_id": user_name,
        "cube_id": f"{user_name}",
        "text_mem": {
            "backend": "tree_text",
            "config": {
                "extractor_llm": {
                    "backend": "openai",
                    "config": openapi_config,
                },
                "dispatcher_llm": {
                    "backend": "openai",
                    "config": openapi_config,
                },
                "graph_db": {
                    "backend": "neo4j",
                    "config": {
                        "uri": neo4j_uri,
                        "user": "neo4j",
                        "password": "iaarlichunyu",
                        "db_name": db_name,
                        "auto_create": True,
                    },
                },
                "embedder": {
                    "backend": "universal_api",
                    "config": {
                        "provider": "openai",
                        "api_key": os.getenv("OPENAI_API_KEY", "sk-xxxxx"),
                        "model_name_or_path": "text-embedding-3-large",
                        "base_url": os.getenv("OPENAI_API_BASE", "https://api.openai.com/v1"),
                    },
                },
                "reorganize": True,
            },
        },
        "act_mem": {},
        "para_mem": {},
    },
)

mem_cube = GeneralMemCube(config)


mos.register_mem_cube(f"/tmp/{user_name}", mem_cube_id=user_name)


with open("evaluation/data/hotpot/hotpot_dev_distractor_v1.json") as f:
    data = json.load(f)


def build_context_text(context_list):
    parts = []
    for title, sentences in context_list:
        text = " ".join(s.strip() for s in sentences if s.strip())
        parts.append(f"{title}: {text}")
    return "\n".join(parts)


def build_and_ask(item):
    qid = item["_id"]
    question = item["question"]

    for title, sentences in item["context"]:
        text = " ".join(s.strip() for s in sentences if s.strip())
        memory_content = f"{title}: {text}"
        mos.add(memory_content=memory_content)

    answer = mos.chat(question).strip()
    return qid, answer


pred_answers = {}

with ThreadPoolExecutor(max_workers=5) as executor:
    futures = {executor.submit(build_and_ask, item): item for item in data}
    for future in tqdm(as_completed(futures), total=len(futures)):
        try:
            qid, answer = future.result()
            pred_answers[qid] = answer
        except Exception as e:
            print(f"Error: {e}")

predictions = {"answer": pred_answers, "sp": []}

with open("evaluation/data/hotpot/output/dev_distractor_pred.json", "w") as f:
    json.dump(predictions, f, ensure_ascii=False, indent=2)
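The predictions file written above matches the format the official HotpotQA evaluator expects: an "answer" map keyed by question id plus an "sp" entry for supporting facts. Since "sp" is left empty here, only the answer metrics (em/f1/prec/recall) will be meaningful when scoring. A minimal sketch of feeding this output to the evaluator defined in hotpot_evaluate_v1.py below; the paths are the ones used in this script, and having evaluation/scripts/hotpot importable is an assumption:

# Sketch: score the predictions with the evaluator from hotpot_evaluate_v1.py.
# Assumes evaluation/scripts/hotpot is on sys.path (e.g., run from that directory).
from hotpot_evaluate_v1 import eval as hotpot_eval

hotpot_eval(
    "evaluation/data/hotpot/output/dev_distractor_pred.json",  # predictions written above
    "evaluation/data/hotpot/hotpot_dev_distractor_v1.json",  # gold dev set
)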
151 changes: 151 additions & 0 deletions evaluation/scripts/hotpot/hotpot_evaluate_v1.py
@@ -0,0 +1,151 @@
import re
import string
import sys

from collections import Counter

import ujson as json


def normalize_answer(s):
    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_score(prediction, ground_truth):
    normalized_prediction = normalize_answer(prediction)
    normalized_ground_truth = normalize_answer(ground_truth)

    zero_metric = (0, 0, 0)

    if (
        normalized_prediction in ["yes", "no", "noanswer"]
        and normalized_prediction != normalized_ground_truth
    ):
        return zero_metric
    if (
        normalized_ground_truth in ["yes", "no", "noanswer"]
        and normalized_prediction != normalized_ground_truth
    ):
        return zero_metric

    prediction_tokens = normalized_prediction.split()
    ground_truth_tokens = normalized_ground_truth.split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return zero_metric
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1, precision, recall


def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)


def update_answer(metrics, prediction, gold):
    em = exact_match_score(prediction, gold)
    f1, prec, recall = f1_score(prediction, gold)
    metrics["em"] += float(em)
    metrics["f1"] += f1
    metrics["prec"] += prec
    metrics["recall"] += recall
    return em, prec, recall


def update_sp(metrics, prediction, gold):
    cur_sp_pred = set(map(tuple, prediction))
    gold_sp_pred = set(map(tuple, gold))
    tp, fp, fn = 0, 0, 0
    for e in cur_sp_pred:
        if e in gold_sp_pred:
            tp += 1
        else:
            fp += 1
    for e in gold_sp_pred:
        if e not in cur_sp_pred:
            fn += 1
    prec = 1.0 * tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = 1.0 * tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = 2 * prec * recall / (prec + recall) if prec + recall > 0 else 0.0
    em = 1.0 if fp + fn == 0 else 0.0
    metrics["sp_em"] += em
    metrics["sp_f1"] += f1
    metrics["sp_prec"] += prec
    metrics["sp_recall"] += recall
    return em, prec, recall


def eval(prediction_file, gold_file):
    with open(prediction_file) as f:
        prediction = json.load(f)
    with open(gold_file) as f:
        gold = json.load(f)

    metrics = {
        "em": 0,
        "f1": 0,
        "prec": 0,
        "recall": 0,
        "sp_em": 0,
        "sp_f1": 0,
        "sp_prec": 0,
        "sp_recall": 0,
        "joint_em": 0,
        "joint_f1": 0,
        "joint_prec": 0,
        "joint_recall": 0,
    }
    for dp in gold:
        cur_id = dp["_id"]
        can_eval_joint = True
        if cur_id not in prediction["answer"]:
            print(f"missing answer {cur_id}")
            can_eval_joint = False
        else:
            em, prec, recall = update_answer(metrics, prediction["answer"][cur_id], dp["answer"])
        if cur_id not in prediction["sp"]:
            print(f"missing sp fact {cur_id}")
            can_eval_joint = False
        else:
            sp_em, sp_prec, sp_recall = update_sp(
                metrics, prediction["sp"][cur_id], dp["supporting_facts"]
            )

        if can_eval_joint:
            joint_prec = prec * sp_prec
            joint_recall = recall * sp_recall
            if joint_prec + joint_recall > 0:
                joint_f1 = 2 * joint_prec * joint_recall / (joint_prec + joint_recall)
            else:
                joint_f1 = 0.0
            joint_em = em * sp_em

            metrics["joint_em"] += joint_em
            metrics["joint_f1"] += joint_f1
            metrics["joint_prec"] += joint_prec
            metrics["joint_recall"] += joint_recall

    n = len(gold)
    for k in metrics:
        metrics[k] /= n

    print(metrics)


if __name__ == "__main__":
    eval(sys.argv[1], sys.argv[2])
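The evaluator is invoked from the command line as python hotpot_evaluate_v1.py <prediction_file> <gold_file>. A quick, hand-checked sanity test of the scoring helpers, as a sketch rather than part of the script:

# Sanity check of the scoring helpers; expected values worked out by hand.
from hotpot_evaluate_v1 import exact_match_score, f1_score

# Normalization lowercases, strips punctuation, and drops articles.
assert exact_match_score("The Eiffel Tower!", "eiffel tower")

# One of two predicted tokens overlaps the gold answer:
# precision = 0.5, recall = 1.0, F1 = 2/3.
f1, prec, recall = f1_score("Paris France", "Paris")
assert round(f1, 3) == 0.667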
20 changes: 20 additions & 0 deletions evaluation/scripts/longbenchV2/import_data.py
@@ -0,0 +1,20 @@
from datasets import load_dataset


dataset = load_dataset("zai-org/LongBench-v2", split="train")
print(dataset)


def truncate(value, max_len=200):
    if isinstance(value, str) and len(value) > max_len:
        return value[:max_len] + "... [TRUNCATED]"
    return value


for i in range(10):
    sample = dataset[i]
    print(f"========== Sample {i} ==========")
    for key, value in sample.items():
        print(f"{key}: {truncate(value)}")

    print("\n")
Empty file.