From fc187465534f46d120415939aaa20c56a759e772 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Thu, 18 Dec 2025 13:17:25 +0800 Subject: [PATCH 01/20] feat: multi omics KG building --- .../search/build_db/build_dna_blast_db.sh | 156 ++++-- .../search/build_db/build_protein_blast_db.sh | 74 ++- .../search/build_db/build_rna_blast_db.sh | 482 +++++++++++++----- examples/search/search_dna.sh | 4 +- examples/search/search_dna_config.yaml | 26 +- examples/search/search_protein_config.yaml | 26 +- examples/search/search_rna.sh | 4 +- examples/search/search_rna_config.yaml | 26 +- examples/search/search_uniprot.sh | 3 +- graphgen/bases/base_operator.py | 12 +- graphgen/bases/base_reader.py | 1 + graphgen/bases/base_searcher.py | 15 +- graphgen/engine.py | 34 +- graphgen/models/__init__.py | 3 +- graphgen/models/evaluator/length_evaluator.py | 6 +- graphgen/models/evaluator/mtld_evaluator.py | 6 +- graphgen/models/generator/__init__.py | 1 + graphgen/models/kg_builder/__init__.py | 1 + graphgen/models/llm/local/sglang_wrapper.py | 2 +- graphgen/models/llm/local/vllm_wrapper.py | 12 +- .../partitioner/anchor_bfs_partitioner.py | 79 ++- graphgen/models/reader/json_reader.py | 84 +-- graphgen/models/searcher/db/ncbi_searcher.py | 354 +++++++------ .../models/searcher/db/rnacentral_searcher.py | 164 ++++-- .../models/searcher/db/uniprot_searcher.py | 155 +++--- graphgen/models/storage/kv/json_storage.py | 76 ++- graphgen/operators/__init__.py | 6 +- .../operators/build_kg/build_kg_service.py | 29 +- graphgen/operators/chunk/chunk_service.py | 6 +- .../operators/generate/generate_service.py | 32 +- graphgen/operators/judge/judge_service.py | 4 +- .../operators/partition/partition_service.py | 28 +- graphgen/operators/quiz/quiz_service.py | 7 +- graphgen/operators/read/__init__.py | 2 +- graphgen/operators/read/read.py | 3 +- graphgen/operators/search/__init__.py | 2 +- graphgen/run.py | 132 ++--- graphgen/templates/__init__.py | 8 +- .../extraction/schema_guided_extraction.py | 2 +- graphgen/templates/generation/__init__.py | 1 + .../templates/generation/atomic_generation.py | 2 +- .../templates/generation/cot_generation.py | 4 +- graphgen/templates/kg/__init__.py | 1 + graphgen/templates/kg/mm_kg_extraction.py | 2 +- graphgen/templates/search_judgement.py | 2 +- graphgen/utils/run_concurrent.py | 94 +++- graphgen/utils/wrap.py | 12 +- 47 files changed, 1350 insertions(+), 835 deletions(-) diff --git a/examples/search/build_db/build_dna_blast_db.sh b/examples/search/build_db/build_dna_blast_db.sh index 1928d7d0..969ebbac 100755 --- a/examples/search/build_db/build_dna_blast_db.sh +++ b/examples/search/build_db/build_dna_blast_db.sh @@ -24,8 +24,8 @@ set -e # - {category}.{number}.genomic.fna.gz (基因组序列) # - {category}.{number}.rna.fna.gz (RNA序列) # -# Usage: ./build_dna_blast_db.sh [human_mouse|representative|complete|all] -# human_mouse: Download only Homo sapiens and Mus musculus sequences (minimal, smallest) +# Usage: ./build_dna_blast_db.sh [human_mouse_drosophila_yeast|representative|complete|all] +# human_mouse_drosophila_yeast: Download only Homo sapiens, Mus musculus, Drosophila melanogaster, and Saccharomyces cerevisiae sequences (minimal, smallest) # representative: Download genomic sequences from major categories (recommended, smaller) # Includes: vertebrate_mammalian, vertebrate_other, bacteria, archaea, fungi # complete: Download all complete genomic sequences from complete/ directory (very large) @@ -36,7 +36,7 @@ set -e # For CentOS/RHEL/Fedora: sudo dnf install 
ncbi-blast+ # Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ -DOWNLOAD_TYPE=${1:-human_mouse} +DOWNLOAD_TYPE=${1:-human_mouse_drosophila_yeast} # Better to use a stable DOWNLOAD_TMP name to support resuming downloads DOWNLOAD_TMP=_downloading_dna @@ -58,17 +58,35 @@ else echo "Using date as release identifier: ${RELEASE}" fi -# Function to check if a file contains target species -check_file_for_species() { - local url=$1 - local filename=$2 - local temp_file="/tmp/check_${filename//\//_}" + # First check if file is already downloaded locally + if check_file_downloaded "${filename}"; then + # File already exists, check if it contains target species + # Check both compressed and decompressed versions + local decompressed_file="${filename%.gz}" + if [ -f "${filename}" ]; then + # Compressed file exists + if gunzip -c "${filename}" 2>/dev/null | head -2000 | grep -qE "(Homo sapiens|Mus musculus|Drosophila melanogaster|Saccharomyces cerevisiae)"; then + return 0 # Contains target species + else + return 1 # Does not contain target species + fi + elif [ -f "${decompressed_file}" ]; then + # Decompressed file exists + if head -2000 "${decompressed_file}" 2>/dev/null | grep -qE "(Homo sapiens|Mus musculus|Drosophila melanogaster|Saccharomyces cerevisiae)"; then + return 0 # Contains target species + else + return 1 # Does not contain target species + fi + fi + fi + # File not downloaded yet, download first 500KB to check # Download first 500KB (enough to get many sequence headers) # This should be sufficient to identify the species in most cases if curl -s --max-time 30 --range 0-512000 "${url}" -o "${temp_file}" 2>/dev/null && [ -s "${temp_file}" ]; then # Try to decompress and check for species names - if gunzip -c "${temp_file}" 2>/dev/null | head -2000 | grep -qE "(Homo sapiens|Mus musculus)"; then + # Check for: Homo sapiens (人), Mus musculus (小鼠), Drosophila melanogaster (果蝇), Saccharomyces cerevisiae (酵母) + if gunzip -c "${temp_file}" 2>/dev/null | head -2000 | grep -qE "(Homo sapiens|Mus musculus|Drosophila melanogaster|Saccharomyces cerevisiae)"; then rm -f "${temp_file}" return 0 # Contains target species else @@ -84,39 +102,57 @@ check_file_for_species() { # Download based on type case ${DOWNLOAD_TYPE} in - human_mouse) - echo "Downloading RefSeq sequences for Homo sapiens and Mus musculus only (minimal size)..." - echo "This will check each file to see if it contains human or mouse sequences..." - category="vertebrate_mammalian" - echo "Checking files in ${category} category..." - - # Get list of files and save to temp file to avoid subshell issues - curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \ - grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \ - sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files.txt + human_mouse_drosophila_yeast) + echo "Downloading RefSeq sequences for Homo sapiens, Mus musculus, Drosophila melanogaster, and Saccharomyces cerevisiae (minimal size)..." + echo "This will check each file to see if it contains target species sequences..." - file_count=0 - download_count=0 + # Check multiple categories: vertebrate_mammalian (人、小鼠), invertebrate (果蝇), fungi (酵母) + categories="vertebrate_mammalian invertebrate fungi" + total_file_count=0 + total_download_count=0 - while read filename; do - file_count=$((file_count + 1)) - url="https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" - echo -n "[${file_count}] Checking ${filename}... 
" + for category in ${categories}; do + echo "Checking files in ${category} category..." - if check_file_for_species "${url}" "${filename}"; then - echo "✓ contains target species, downloading..." - download_count=$((download_count + 1)) - wget -c -q --show-progress "${url}" || { - echo "Warning: Failed to download ${filename}" - } - else - echo "✗ skipping (no human/mouse data)" - fi - done < /tmp/refseq_files.txt + # Get list of files and save to temp file to avoid subshell issues + curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \ + grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \ + sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files_${category}.txt + + file_count=0 + download_count=0 + + while read filename; do + file_count=$((file_count + 1)) + total_file_count=$((total_file_count + 1)) + url="https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" + echo -n "[${total_file_count}] Checking ${category}/${filename}... " + + if check_file_for_species "${url}" "${filename}"; then + # Check if file is already downloaded + if check_file_downloaded "${filename}"; then + echo "✓ already downloaded (contains target species)" + download_count=$((download_count + 1)) + total_download_count=$((total_download_count + 1)) + else + echo "✓ contains target species, downloading..." + download_count=$((download_count + 1)) + total_download_count=$((total_download_count + 1)) + wget -c -q --show-progress "${url}" || { + echo "Warning: Failed to download ${filename}" + } + fi + else + echo "✗ skipping (no target species data)" + fi + done < /tmp/refseq_files_${category}.txt + + rm -f /tmp/refseq_files_${category}.txt + echo " ${category}: Checked ${file_count} files, downloaded ${download_count} files." + done - rm -f /tmp/refseq_files.txt echo "" - echo "Summary: Checked ${file_count} files, downloaded ${download_count} files containing human or mouse sequences." + echo "Summary: Checked ${total_file_count} files total, downloaded ${total_download_count} files containing target species (human, mouse, fruit fly, yeast)." ;; representative) echo "Downloading RefSeq representative sequences (recommended, smaller size)..." @@ -124,52 +160,76 @@ case ${DOWNLOAD_TYPE} in # Note: You can modify this list based on your specific requirements for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi; do echo "Downloading ${category} sequences..." + # Get list of files and save to temp file to avoid subshell issues curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \ grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \ - sed 's/href="\(.*\)"/\1/' | \ - while read filename; do + sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files_${category}.txt + + while read filename; do + if check_file_downloaded "${filename}"; then + echo " ✓ ${filename} already downloaded, skipping..." + else echo " Downloading ${filename}..." wget -c -q --show-progress \ "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" || { echo "Warning: Failed to download ${filename}" } - done + fi + done < /tmp/refseq_files_${category}.txt + + rm -f /tmp/refseq_files_${category}.txt done ;; complete) echo "Downloading RefSeq complete genomic sequences (WARNING: very large, may take hours)..." 
+ # Get list of files and save to temp file to avoid subshell issues curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/" | \ grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \ - sed 's/href="\(.*\)"/\1/' | \ - while read filename; do + sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files_complete.txt + + while read filename; do + if check_file_downloaded "${filename}"; then + echo " ✓ ${filename} already downloaded, skipping..." + else echo " Downloading ${filename}..." wget -c -q --show-progress \ "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/${filename}" || { echo "Warning: Failed to download ${filename}" } - done + fi + done < /tmp/refseq_files_complete.txt + + rm -f /tmp/refseq_files_complete.txt ;; all) echo "Downloading all RefSeq genomic sequences from all categories (WARNING: extremely large, may take many hours)..." # Download genomic sequences from all categories for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi invertebrate plant viral protozoa mitochondrion plastid plasmid other; do echo "Downloading ${category} genomic sequences..." + # Get list of files and save to temp file to avoid subshell issues curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \ grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \ - sed 's/href="\(.*\)"/\1/' | \ - while read filename; do + sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files_${category}.txt + + while read filename; do + if check_file_downloaded "${filename}"; then + echo " ✓ ${filename} already downloaded, skipping..." + else echo " Downloading ${filename}..." wget -c -q --show-progress \ "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" || { echo "Warning: Failed to download ${filename}" } - done + fi + done < /tmp/refseq_files_${category}.txt + + rm -f /tmp/refseq_files_${category}.txt done ;; *) echo "Error: Unknown download type '${DOWNLOAD_TYPE}'" - echo "Usage: $0 [human_mouse|representative|complete|all]" - echo " human_mouse: Download only Homo sapiens and Mus musculus (minimal)" + echo "Usage: $0 [human_mouse_drosophila_yeast|representative|complete|all]" + echo " human_mouse_drosophila_yeast: Download only Homo sapiens, Mus musculus, Drosophila melanogaster, and Saccharomyces cerevisiae (minimal)" echo " representative: Download major categories (recommended)" echo " complete: Download all complete genomic sequences (very large)" echo " all: Download all genomic sequences (extremely large)" diff --git a/examples/search/build_db/build_protein_blast_db.sh b/examples/search/build_db/build_protein_blast_db.sh index 9292875a..da4c2b4b 100755 --- a/examples/search/build_db/build_protein_blast_db.sh +++ b/examples/search/build_db/build_protein_blast_db.sh @@ -9,48 +9,78 @@ set -e # For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+ # Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ -# Better to use a stable DOWNLOAD_TMP name to support resuming downloads -DOWNLOAD_TMP=_downloading -mkdir -p ${DOWNLOAD_TMP} -cd ${DOWNLOAD_TMP} - -wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/RELEASE.metalink" +echo "Downloading RELEASE.metalink..." 
+wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/RELEASE.metalink" # Extract the release name (like 2017_10 or 2017_1) # Use sed for cross-platform compatibility (works on both macOS and Linux) RELEASE=$(sed -n 's/.*\([0-9]\{4\}_[0-9]\{1,2\}\)<\/version>.*/\1/p' RELEASE.metalink | head -1) -wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" -wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz" -wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/reldate.txt" -wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/README" -wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/LICENSE" +echo "UniProt release: ${RELEASE}" +echo "" + +# Download Swiss-Prot (always needed) +echo "Downloading uniprot_sprot.fasta.gz..." +wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" + +# Download TrEMBL only if full mode +if [ "${DOWNLOAD_MODE}" = "full" ]; then + echo "Downloading uniprot_trembl.fasta.gz..." + wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz" +fi + +# Download metadata files +echo "Downloading metadata files..." +wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/reldate.txt" +wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/README" +wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/LICENSE" cd .. -mkdir ${RELEASE} +mkdir -p ${RELEASE} mv ${DOWNLOAD_TMP}/* ${RELEASE} rmdir ${DOWNLOAD_TMP} cd ${RELEASE} +echo "" +echo "Extracting files..." gunzip uniprot_sprot.fasta.gz -gunzip uniprot_trembl.fasta.gz -cat uniprot_sprot.fasta uniprot_trembl.fasta >uniprot_${RELEASE}.fasta +if [ "${DOWNLOAD_MODE}" = "full" ]; then + gunzip uniprot_trembl.fasta.gz + echo "Merging Swiss-Prot and TrEMBL..." + cat uniprot_sprot.fasta uniprot_trembl.fasta >uniprot_${RELEASE}.fasta +fi -makeblastdb -in uniprot_${RELEASE}.fasta -out uniprot_${RELEASE} -dbtype prot -parse_seqids -title uniprot_${RELEASE} +echo "" +echo "Building BLAST databases..." + +# Always build Swiss-Prot database makeblastdb -in uniprot_sprot.fasta -out uniprot_sprot -dbtype prot -parse_seqids -title uniprot_sprot -makeblastdb -in uniprot_trembl.fasta -out uniprot_trembl -dbtype prot -parse_seqids -title uniprot_trembl + +# Build full release database only if in full mode +if [ "${DOWNLOAD_MODE}" = "full" ]; then + makeblastdb -in uniprot_${RELEASE}.fasta -out uniprot_${RELEASE} -dbtype prot -parse_seqids -title uniprot_${RELEASE} + makeblastdb -in uniprot_trembl.fasta -out uniprot_trembl -dbtype prot -parse_seqids -title uniprot_trembl +fi cd .. +echo "" echo "BLAST databases created successfully!" 
echo "Database locations:" -echo " - Combined: $(pwd)/${RELEASE}/uniprot_${RELEASE}" -echo " - Swiss-Prot: $(pwd)/${RELEASE}/uniprot_sprot" -echo " - TrEMBL: $(pwd)/${RELEASE}/uniprot_trembl" -echo "" -echo "To use these databases, set in your config:" -echo " local_blast_db: $(pwd)/${RELEASE}/uniprot_sprot # or uniprot_${RELEASE} or uniprot_trembl" +if [ "${DOWNLOAD_MODE}" = "sprot" ]; then + echo " - Swiss-Prot: $(pwd)/${RELEASE}/uniprot_sprot" + echo "" + echo "To use this database, set in your config:" + echo " local_blast_db: $(pwd)/${RELEASE}/uniprot_sprot" +else + echo " - Combined: $(pwd)/${RELEASE}/uniprot_${RELEASE}" + echo " - Swiss-Prot: $(pwd)/${RELEASE}/uniprot_sprot" + echo " - TrEMBL: $(pwd)/${RELEASE}/uniprot_trembl" + echo "" + echo "To use these databases, set in your config:" + echo " local_blast_db: $(pwd)/${RELEASE}/uniprot_sprot # or uniprot_${RELEASE} or uniprot_trembl" +fi diff --git a/examples/search/build_db/build_rna_blast_db.sh b/examples/search/build_db/build_rna_blast_db.sh index 26e1cd33..af688ac1 100755 --- a/examples/search/build_db/build_rna_blast_db.sh +++ b/examples/search/build_db/build_rna_blast_db.sh @@ -10,16 +10,20 @@ set -e # RNAcentral is a comprehensive database of non-coding RNA sequences that # integrates data from multiple expert databases including RefSeq, Rfam, etc. # -# Usage: ./build_rna_blast_db.sh [all|list|database_name] +# Usage: ./build_rna_blast_db.sh [all|list|selected|database_name...] # all (default): Download complete active database (~8.4G compressed) # list: List all available database subsets +# selected: Download predefined database subsets (ensembl_gencode, mirbase, gtrnadb, refseq, lncbase) # database_name: Download specific database subset (e.g., refseq, rfam, mirbase) +# database_name1 database_name2 ...: Download multiple database subsets # # Available database subsets (examples): # - refseq.fasta (~98M): RefSeq RNA sequences # - rfam.fasta (~1.5G): Rfam RNA families # - mirbase.fasta (~10M): microRNA sequences -# - ensembl.fasta (~2.9G): Ensembl annotations +# - ensembl_gencode.fasta (~337M): Ensembl/GENCODE annotations (human) +# - gtrnadb.fasta (~38M): tRNA sequences +# - lncbase.fasta (~106K): Human lncRNA database # - See "list" option for complete list # # The complete "active" database contains all sequences from all expert databases. @@ -30,20 +34,24 @@ set -e # For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+ # Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ -# RNAcentral HTTP base URL (using HTTPS for better reliability) +# RNAcentral base URL (using EBI HTTPS) +# NOTE: RNAcentral only has one official mirror at EBI RNACENTRAL_BASE="https://ftp.ebi.ac.uk/pub/databases/RNAcentral" RNACENTRAL_RELEASE_URL="${RNACENTRAL_BASE}/current_release" RNACENTRAL_SEQUENCES_URL="${RNACENTRAL_RELEASE_URL}/sequences" RNACENTRAL_BY_DB_URL="${RNACENTRAL_SEQUENCES_URL}/by-database" -# Parse command line argument -DB_SELECTION=${1:-all} +# Parse command line arguments +DB_SELECTION=${1:-selected} + +# Predefined database list for "selected" option +SELECTED_DATABASES=("ensembl_gencode" "mirbase" "gtrnadb" "refseq" "lncbase" "rfam") # List available databases if requested if [ "${DB_SELECTION}" = "list" ]; then echo "Available RNAcentral database subsets:" echo "" - echo "Fetching list from RNAcentral FTP..." + echo "Fetching list from RNAcentral..." 
listing=$(curl -s "${RNACENTRAL_BY_DB_URL}/") echo "${listing}" | \ grep -oE '' | \ @@ -54,30 +62,41 @@ if [ "${DB_SELECTION}" = "list" ]; then echo " - ${db%.fasta}: ${size}" done echo "" - echo "Usage: $0 [database_name]" + echo "Usage: $0 [all|list|selected|database_name...]" echo " Example: $0 refseq # Download only RefSeq sequences (~98M)" echo " Example: $0 rfam # Download only Rfam sequences (~1.5G)" + echo " Example: $0 selected # Download predefined databases (ensembl_gencode, mirbase, gtrnadb, refseq, lncbase, rfam)" + echo " Example: $0 refseq mirbase # Download multiple databases" echo " Example: $0 all # Download complete active database (~8.4G)" exit 0 fi -# Better to use a stable DOWNLOAD_TMP name to support resuming downloads -DOWNLOAD_TMP=_downloading_rnacentral -mkdir -p ${DOWNLOAD_TMP} -cd ${DOWNLOAD_TMP} +# Determine which databases to download +if [ "${DB_SELECTION}" = "selected" ]; then + # Use predefined database list + DATABASES=("${SELECTED_DATABASES[@]}") + echo "Downloading selected databases: ${DATABASES[*]}" +elif [ "${DB_SELECTION}" = "all" ]; then + # Single database mode (all) + DATABASES=("all") +else + # Multiple databases provided as arguments + DATABASES=("$@") +fi -# Get RNAcentral release version from release notes +# Get RNAcentral release version from release notes (once for all databases) echo "Getting RNAcentral release information..." RELEASE_NOTES_URL="${RNACENTRAL_RELEASE_URL}/release_notes.txt" -RELEASE_NOTES="release_notes.txt" -wget -q "${RELEASE_NOTES_URL}" 2>/dev/null || { +RELEASE_NOTES_TMP=$(mktemp) +wget -q "${RELEASE_NOTES_URL}" -O "${RELEASE_NOTES_TMP}" 2>/dev/null || { echo "Warning: Could not download release notes, using current date as release identifier" RELEASE=$(date +%Y%m%d) } -if [ -f "${RELEASE_NOTES}" ]; then +if [ -f "${RELEASE_NOTES_TMP}" ] && [ -s "${RELEASE_NOTES_TMP}" ]; then # Try to extract version from release notes (first line usually contains version info) - RELEASE=$(head -1 "${RELEASE_NOTES}" | grep -oE '[0-9]+\.[0-9]+' | head -1 | tr -d '.') + RELEASE=$(head -1 "${RELEASE_NOTES_TMP}" | grep -oE '[0-9]+\.[0-9]+' | head -1 | tr -d '.') + rm -f "${RELEASE_NOTES_TMP}" fi if [ -z "${RELEASE}" ]; then @@ -87,133 +106,328 @@ else echo "RNAcentral release: ${RELEASE}" fi -# Download RNAcentral FASTA file -if [ "${DB_SELECTION}" = "all" ]; then - # Download complete active database - FASTA_FILE="rnacentral_active.fasta.gz" - DB_NAME="rnacentral" - echo "Downloading RNAcentral active sequences (~8.4G)..." - echo " Contains sequences currently present in at least one expert database" - echo " Uses standard URS IDs (e.g., URS000149A9AF)" - echo " ⭐ MATCHES the online RNAcentral API database - ensures consistency" - FASTA_URL="${RNACENTRAL_SEQUENCES_URL}/${FASTA_FILE}" - IS_COMPRESSED=true -else - # Download specific database subset - DB_NAME="${DB_SELECTION}" - FASTA_FILE="${DB_SELECTION}.fasta" - echo "Downloading RNAcentral database subset: ${DB_SELECTION}" - echo " This is a subset of the active database from a specific expert database" - echo " File: ${FASTA_FILE}" - FASTA_URL="${RNACENTRAL_BY_DB_URL}/${FASTA_FILE}" - IS_COMPRESSED=false - - # Check if database exists - if ! 
curl -s -o /dev/null -w "%{http_code}" "${FASTA_URL}" | grep -q "200"; then - echo "Error: Database '${DB_SELECTION}' not found" - echo "Run '$0 list' to see available databases" +# Process each database +DB_COUNT=${#DATABASES[@]} +DB_INDEX=0 + +for DB_SELECTION in "${DATABASES[@]}"; do + DB_INDEX=$((DB_INDEX + 1)) + echo "" + echo "==========================================" + echo "Processing database ${DB_INDEX}/${DB_COUNT}: ${DB_SELECTION}" + echo "==========================================" + echo "" + + # Check if database already exists and is complete + # First check with current release version + if [ "${DB_SELECTION}" = "all" ]; then + OUTPUT_DIR="rnacentral_${RELEASE}" + DB_NAME="rnacentral" + DB_OUTPUT_NAME="${DB_NAME}_${RELEASE}" + else + OUTPUT_DIR="rnacentral_${DB_SELECTION}_${RELEASE}" + DB_NAME="${DB_SELECTION}" + DB_OUTPUT_NAME="${DB_NAME}_${RELEASE}" + fi + + # Check if BLAST database already exists with current release + if [ -d "${OUTPUT_DIR}" ] && [ -f "${OUTPUT_DIR}/${DB_OUTPUT_NAME}.nhr" ] && [ -f "${OUTPUT_DIR}/${DB_OUTPUT_NAME}.nin" ]; then + echo "✓ Database ${DB_SELECTION} already exists and appears complete: ${OUTPUT_DIR}/" + echo " BLAST database: ${OUTPUT_DIR}/${DB_OUTPUT_NAME}" + echo " Skipping download and database creation..." + continue + fi + + # Also check for any existing version of this database (e.g., different release dates) + EXISTING_DIR=$(ls -d rnacentral_${DB_SELECTION}_* 2>/dev/null | head -1) + if [ -n "${EXISTING_DIR}" ] && [ "${DB_SELECTION}" != "all" ]; then + EXISTING_DB_NAME=$(basename "${EXISTING_DIR}" | sed "s/rnacentral_${DB_SELECTION}_//") + if [ -f "${EXISTING_DIR}/${DB_SELECTION}_${EXISTING_DB_NAME}.nhr" ] && [ -f "${EXISTING_DIR}/${DB_SELECTION}_${EXISTING_DB_NAME}.nin" ]; then + echo "✓ Database ${DB_SELECTION} already exists (version ${EXISTING_DB_NAME}): ${EXISTING_DIR}/" + echo " BLAST database: ${EXISTING_DIR}/${DB_SELECTION}_${EXISTING_DB_NAME}" + echo " Skipping download and database creation..." + echo " Note: Using existing version ${EXISTING_DB_NAME} instead of ${RELEASE}" + continue + fi + fi + + # Better to use a stable DOWNLOAD_TMP name to support resuming downloads + DOWNLOAD_TMP="_downloading_rnacentral_${DB_SELECTION}" + mkdir -p ${DOWNLOAD_TMP} + cd ${DOWNLOAD_TMP} + + # Download RNAcentral FASTA file + if [ "${DB_SELECTION}" = "all" ]; then + # Download complete active database + FASTA_FILE="rnacentral_active.fasta.gz" + DB_NAME="rnacentral" + echo "Downloading RNAcentral active sequences (~8.4G)..." + echo " Contains sequences currently present in at least one expert database" + echo " Uses standard URS IDs (e.g., URS000149A9AF)" + echo " ⭐ MATCHES the online RNAcentral API database - ensures consistency" + FASTA_URL="${RNACENTRAL_SEQUENCES_URL}/${FASTA_FILE}" + IS_COMPRESSED=true + else + # Download specific database subset + DB_NAME="${DB_SELECTION}" + FASTA_FILE="${DB_SELECTION}.fasta" + echo "Downloading RNAcentral database subset: ${DB_SELECTION}" + echo " This is a subset of the active database from a specific expert database" + echo " File: ${FASTA_FILE}" + FASTA_URL="${RNACENTRAL_BY_DB_URL}/${FASTA_FILE}" + IS_COMPRESSED=false + + # Check if database exists (use HTTP status code check for HTTPS) + HTTP_CODE=$(curl -s --max-time 10 -o /dev/null -w "%{http_code}" "${FASTA_URL}" 2>/dev/null | tail -1 || echo "000") + if ! echo "${HTTP_CODE}" | grep -q "^200$"; then + echo "Error: Database '${DB_SELECTION}' not found (HTTP code: ${HTTP_CODE})" + echo "Run '$0 list' to see available databases" + cd .. 
+ rm -rf ${DOWNLOAD_TMP} + exit 1 + fi + fi + + echo "Downloading from: ${FASTA_URL}" + echo "This may take a while depending on your internet connection..." + if [ "${DB_SELECTION}" = "all" ]; then + echo "File size is approximately 8-9GB, please be patient..." + else + echo "Downloading database subset..." + fi + + wget -c "${FASTA_URL}" || { + echo "Error: Failed to download RNAcentral FASTA file" + echo "Please check your internet connection and try again" + echo "URL: ${FASTA_URL}" + cd .. + rm -rf ${DOWNLOAD_TMP} + exit 1 + } + + if [ ! -f "${FASTA_FILE}" ]; then + echo "Error: Downloaded file not found" + cd .. + rm -rf ${DOWNLOAD_TMP} exit 1 fi -fi - -echo "Downloading from: ${FASTA_URL}" -echo "This may take a while depending on your internet connection..." -if [ "${DB_SELECTION}" = "all" ]; then - echo "File size is approximately 8-9GB, please be patient..." -else - echo "Downloading database subset..." -fi -wget -c --progress=bar:force "${FASTA_URL}" 2>&1 || { - echo "Error: Failed to download RNAcentral FASTA file" - echo "Please check your internet connection and try again" - echo "You can also try downloading manually from: ${FASTA_URL}" - exit 1 -} - -if [ ! -f "${FASTA_FILE}" ]; then - echo "Error: Downloaded file not found" - exit 1 -fi + + cd .. + + # Create release directory + if [ "${DB_SELECTION}" = "all" ]; then + OUTPUT_DIR="rnacentral_${RELEASE}" + else + OUTPUT_DIR="rnacentral_${DB_NAME}_${RELEASE}" + fi + mkdir -p ${OUTPUT_DIR} + mv ${DOWNLOAD_TMP}/* ${OUTPUT_DIR}/ 2>/dev/null || true + rmdir ${DOWNLOAD_TMP} 2>/dev/null || true + + cd ${OUTPUT_DIR} + + # Extract FASTA file if compressed + echo "Preparing RNAcentral sequences..." + if [ -f "${FASTA_FILE}" ]; then + if [ "${IS_COMPRESSED}" = "true" ]; then + echo "Decompressing ${FASTA_FILE}..." + OUTPUT_FASTA="${DB_NAME}_${RELEASE}.fasta" + gunzip -c "${FASTA_FILE}" > "${OUTPUT_FASTA}" || { + echo "Error: Failed to decompress FASTA file" + cd .. + exit 1 + } + # Optionally remove the compressed file to save space + # rm "${FASTA_FILE}" + else + # File is not compressed, just copy/rename + OUTPUT_FASTA="${DB_NAME}_${RELEASE}.fasta" + cp "${FASTA_FILE}" "${OUTPUT_FASTA}" || { + echo "Error: Failed to copy FASTA file" + cd .. + exit 1 + } + fi + else + echo "Error: FASTA file not found" + cd .. + exit 1 + fi + + # Check if we have sequences + if [ ! -s "${OUTPUT_FASTA}" ]; then + echo "Error: FASTA file is empty" + cd .. + exit 1 + fi + + # Get file size for user information + FILE_SIZE=$(du -h "${OUTPUT_FASTA}" | cut -f1) + echo "FASTA file size: ${FILE_SIZE}" + + echo "Creating BLAST database..." + # Create BLAST database for RNA sequences (use -dbtype nucl for nucleotide) + # Note: RNAcentral uses RNAcentral IDs (URS...) as sequence identifiers, + # which matches the format expected by the RNACentralSearch class + DB_OUTPUT_NAME="${DB_NAME}_${RELEASE}" + makeblastdb -in "${OUTPUT_FASTA}" \ + -out "${DB_OUTPUT_NAME}" \ + -dbtype nucl \ + -parse_seqids \ + -title "RNAcentral_${DB_NAME}_${RELEASE}" + + echo "" + echo "BLAST database created successfully!" 
+ echo "Database location: $(pwd)/${DB_OUTPUT_NAME}" + echo "" + echo "To use this database, set in your config (search_rna_config.yaml):" + echo " rnacentral_params:" + echo " use_local_blast: true" + echo " local_blast_db: $(pwd)/${DB_OUTPUT_NAME}" + echo "" + echo "Note: The database files are:" + ls -lh ${DB_OUTPUT_NAME}.* | head -5 + echo "" + if [ "${DB_SELECTION}" = "all" ]; then + echo "This database uses RNAcentral IDs (URS...), which matches the online" + echo "RNAcentral search API, ensuring consistent results between local and online searches." + else + echo "This is a subset database from ${DB_SELECTION} expert database." + echo "For full coverage matching online API, use 'all' option." + fi + + cd .. +done -cd .. +echo "" +echo "==========================================" +echo "All databases processed successfully!" +echo "==========================================" +echo "" -# Create release directory -if [ "${DB_SELECTION}" = "all" ]; then - OUTPUT_DIR="rnacentral_${RELEASE}" -else - OUTPUT_DIR="rnacentral_${DB_NAME}_${RELEASE}" -fi -mkdir -p ${OUTPUT_DIR} -mv ${DOWNLOAD_TMP}/* ${OUTPUT_DIR}/ 2>/dev/null || true -rmdir ${DOWNLOAD_TMP} 2>/dev/null || true - -cd ${OUTPUT_DIR} - -# Extract FASTA file if compressed -echo "Preparing RNAcentral sequences..." -if [ -f "${FASTA_FILE}" ]; then - if [ "${IS_COMPRESSED}" = "true" ]; then - echo "Decompressing ${FASTA_FILE}..." - OUTPUT_FASTA="${DB_NAME}_${RELEASE}.fasta" - gunzip -c "${FASTA_FILE}" > "${OUTPUT_FASTA}" || { - echo "Error: Failed to decompress FASTA file" - exit 1 +# If multiple databases were downloaded, offer to merge them +if [ ${#DATABASES[@]} -gt 1 ] && [ "${DATABASES[0]}" != "all" ]; then + echo "Multiple databases downloaded. Creating merged database for unified search..." + MERGED_DIR="rnacentral_merged_${RELEASE}" + mkdir -p ${MERGED_DIR} + cd ${MERGED_DIR} + + MERGED_FASTA="rnacentral_merged_${RELEASE}.fasta" + MERGED_FASTA_TMP="${MERGED_FASTA}.tmp" + echo "Combining FASTA files from all databases..." + echo " Note: Duplicate sequence IDs will be removed (keeping first occurrence)..." + + # Combine all FASTA files into a temporary file + # Find actual database directories (may have different release versions) + FOUND_ANY=false + for DB_SELECTION in "${DATABASES[@]}"; do + [ "${DB_SELECTION}" = "all" ] && continue + + # Try current release version first, then search for any existing version + OUTPUT_FASTA="../rnacentral_${DB_SELECTION}_${RELEASE}/${DB_SELECTION}_${RELEASE}.fasta" + [ ! -f "${OUTPUT_FASTA}" ] && { + EXISTING_DIR=$(ls -d ../rnacentral_${DB_SELECTION}_* 2>/dev/null | head -1) + [ -n "${EXISTING_DIR}" ] && { + EXISTING_VERSION=$(basename "${EXISTING_DIR}" | sed "s/rnacentral_${DB_SELECTION}_//") + OUTPUT_FASTA="${EXISTING_DIR}/${DB_SELECTION}_${EXISTING_VERSION}.fasta" + } } - # Optionally remove the compressed file to save space - # rm "${FASTA_FILE}" - else - # File is not compressed, just copy/rename - OUTPUT_FASTA="${DB_NAME}_${RELEASE}.fasta" - cp "${FASTA_FILE}" "${OUTPUT_FASTA}" || { - echo "Error: Failed to copy FASTA file" - exit 1 + + if [ -f "${OUTPUT_FASTA}" ]; then + echo " Adding ${DB_SELECTION} sequences..." + cat "${OUTPUT_FASTA}" >> "${MERGED_FASTA_TMP}" + FOUND_ANY=true + else + echo " Warning: Could not find FASTA file for ${DB_SELECTION}" + fi + done + + # Validate that we have files to merge + if [ "${FOUND_ANY}" = "false" ] || [ ! -s "${MERGED_FASTA_TMP}" ]; then + echo "Error: No FASTA files found to merge" + cd .. 
+ rm -rf ${MERGED_DIR} + exit 1 + fi + + # Remove duplicates based on sequence ID (keeping first occurrence) + echo " Removing duplicate sequence IDs..." + awk ' + /^>/ { + # Process previous sequence if we have one + if (current_id != "" && !seen[current_id]) { + print current_header ORS current_seq + seen[current_id] = 1 + } + # Start new sequence + current_header = $0 + current_id = substr($0, 2) + sub(/[ \t].*/, "", current_id) # Extract ID up to first space/tab + current_seq = "" + next + } + { + # Accumulate sequence data by concatenating lines + current_seq = current_seq $0 + } + END { + # Process last sequence + if (current_id != "" && !seen[current_id]) { + print current_header ORS current_seq } + } + ' "${MERGED_FASTA_TMP}" > "${MERGED_FASTA}" + rm -f "${MERGED_FASTA_TMP}" + + # Check if merged file was created and has content + if [ ! -s "${MERGED_FASTA}" ]; then + echo "Warning: Merged FASTA file is empty or not created" + cd .. + rm -rf ${MERGED_DIR} + else + FILE_SIZE=$(du -h "${MERGED_FASTA}" | cut -f1) + echo "Merged FASTA file size: ${FILE_SIZE}" + + echo "Creating merged BLAST database..." + MERGED_DB_NAME="rnacentral_merged_${RELEASE}" + makeblastdb -in "${MERGED_FASTA}" \ + -out "${MERGED_DB_NAME}" \ + -dbtype nucl \ + -parse_seqids \ + -title "RNAcentral_Merged_${RELEASE}" + + echo "" + echo "✓ Merged BLAST database created successfully!" + echo "Database location: $(pwd)/${MERGED_DB_NAME}" + echo "" + echo "To use the merged database, set in your config (search_rna_config.yaml):" + echo " rnacentral_params:" + echo " use_local_blast: true" + echo " local_blast_db: $(pwd)/${MERGED_DB_NAME}" + echo "" + echo "Note: The merged database includes: ${DATABASES[*]}" + cd .. fi -else - echo "Error: FASTA file not found" - exit 1 fi -# Check if we have sequences -if [ ! -s "${OUTPUT_FASTA}" ]; then - echo "Error: FASTA file is empty" - exit 1 -fi - -# Get file size for user information -FILE_SIZE=$(du -h "${OUTPUT_FASTA}" | cut -f1) -echo "FASTA file size: ${FILE_SIZE}" - -echo "Creating BLAST database..." -# Create BLAST database for RNA sequences (use -dbtype nucl for nucleotide) -# Note: RNAcentral uses RNAcentral IDs (URS...) as sequence identifiers, -# which matches the format expected by the RNACentralSearch class -DB_OUTPUT_NAME="${DB_NAME}_${RELEASE}" -makeblastdb -in "${OUTPUT_FASTA}" \ - -out "${DB_OUTPUT_NAME}" \ - -dbtype nucl \ - -parse_seqids \ - -title "RNAcentral_${DB_NAME}_${RELEASE}" - echo "" -echo "BLAST database created successfully!" -echo "Database location: $(pwd)/${DB_OUTPUT_NAME}" -echo "" -echo "To use this database, set in your config (search_rna_config.yaml):" -echo " rnacentral_params:" -echo " use_local_blast: true" -echo " local_blast_db: $(pwd)/${DB_OUTPUT_NAME}" -echo "" -echo "Note: The database files are:" -ls -lh ${DB_OUTPUT_NAME}.* | head -5 -echo "" -if [ "${DB_SELECTION}" = "all" ]; then - echo "This database uses RNAcentral IDs (URS...), which matches the online" - echo "RNAcentral search API, ensuring consistent results between local and online searches." -else - echo "This is a subset database from ${DB_SELECTION} expert database." - echo "For full coverage matching online API, use 'all' option." 
-fi +echo "Summary of downloaded databases:" +for DB_SELECTION in "${DATABASES[@]}"; do + if [ "${DB_SELECTION}" = "all" ]; then + OUTPUT_DIR="rnacentral_${RELEASE}" + DB_NAME="rnacentral" + else + OUTPUT_DIR="rnacentral_${DB_SELECTION}_${RELEASE}" + DB_NAME="${DB_SELECTION}" + fi + if [ -d "${OUTPUT_DIR}" ]; then + echo " - ${DB_NAME}: ${OUTPUT_DIR}/" + fi +done -cd .. +if [ -d "rnacentral_merged_${RELEASE}" ]; then + echo " - merged (all databases): rnacentral_merged_${RELEASE}/" + echo "" + echo "💡 Recommendation: Use the merged database for searching across all databases." +fi diff --git a/examples/search/search_dna.sh b/examples/search/search_dna.sh index d3c0d6ec..e05ab751 100644 --- a/examples/search/search_dna.sh +++ b/examples/search/search_dna.sh @@ -1,2 +1,4 @@ python3 -m graphgen.run \ ---config_file graphgen/configs/search_dna_config.yaml +--config_file examples/search/search_dna_config.yaml \ +--output_dir cache/ + diff --git a/examples/search/search_dna_config.yaml b/examples/search/search_dna_config.yaml index f53a5eb8..adbe7e1c 100644 --- a/examples/search/search_dna_config.yaml +++ b/examples/search/search_dna_config.yaml @@ -1,12 +1,23 @@ -pipeline: - - name: read_step - op_key: read +global_params: + working_dir: cache + +nodes: + - id: read_step + op_name: read + type: source + dependencies: [] params: - input_file: resources/input_examples/search_dna_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples + input_path: + - examples/input_examples/search_dna_demo.jsonl # input file path, support json, jsonl, txt, pdf. See examples/input_examples for examples - - name: search_step - op_key: search - deps: [read_step] # search_step depends on read_step + - id: search_step + op_name: search + type: map_batch + dependencies: + - read_step # search_step depends on read_step + execution_params: + replicas: 1 + batch_size: 10 params: data_sources: [ncbi] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral ncbi_params: @@ -15,3 +26,4 @@ pipeline: use_local_blast: true # whether to use local blast for DNA search local_blast_db: refseq_release/refseq_release # path to local BLAST database (without .nhr extension) + diff --git a/examples/search/search_protein_config.yaml b/examples/search/search_protein_config.yaml index bfbf84eb..f73a4514 100644 --- a/examples/search/search_protein_config.yaml +++ b/examples/search/search_protein_config.yaml @@ -1,15 +1,27 @@ -pipeline: - - name: read_step - op_key: read +global_params: + working_dir: cache + +nodes: + - id: read_step + op_name: read + type: source + dependencies: [] params: - input_file: resources/input_examples/search_protein_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples + input_path: + - examples/input_examples/search_protein_demo.jsonl # input file path, support json, jsonl, txt, pdf. 
See examples/input_examples for examples - - name: search_step - op_key: search - deps: [read_step] # search_step depends on read_step + - id: search_step + op_name: search + type: map_batch + dependencies: + - read_step # search_step depends on read_step + execution_params: + replicas: 1 + batch_size: 10 params: data_sources: [uniprot] # data source for searcher, support: wikipedia, google, uniprot uniprot_params: use_local_blast: true # whether to use local blast for uniprot search local_blast_db: /your_path/2024_01/uniprot_sprot # format: /path/to/${RELEASE}/uniprot_sprot # options: uniprot_sprot (recommended, high quality), uniprot_trembl, or uniprot_${RELEASE} (merged database) + diff --git a/examples/search/search_rna.sh b/examples/search/search_rna.sh index 9243d6be..2bf5a406 100644 --- a/examples/search/search_rna.sh +++ b/examples/search/search_rna.sh @@ -1,2 +1,4 @@ python3 -m graphgen.run \ ---config_file graphgen/configs/search_rna_config.yaml +--config_file examples/search/search_rna_config.yaml \ +--output_dir cache/ + diff --git a/examples/search/search_rna_config.yaml b/examples/search/search_rna_config.yaml index 10422988..5b0e825e 100644 --- a/examples/search/search_rna_config.yaml +++ b/examples/search/search_rna_config.yaml @@ -1,14 +1,26 @@ -pipeline: - - name: read_step - op_key: read +global_params: + working_dir: cache + +nodes: + - id: read_step + op_name: read + type: source + dependencies: [] params: - input_file: resources/input_examples/search_rna_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples + input_path: + - examples/input_examples/search_rna_demo.jsonl # input file path, support json, jsonl, txt, pdf. See examples/input_examples for examples - - name: search_step - op_key: search - deps: [read_step] # search_step depends on read_step + - id: search_step + op_name: search + type: map_batch + dependencies: + - read_step # search_step depends on read_step + execution_params: + replicas: 1 + batch_size: 10 params: data_sources: [rnacentral] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral rnacentral_params: use_local_blast: true # whether to use local blast for RNA search local_blast_db: rnacentral_ensembl_gencode_YYYYMMDD/ensembl_gencode_YYYYMMDD # path to local BLAST database (without .nhr extension) + diff --git a/examples/search/search_uniprot.sh b/examples/search/search_uniprot.sh index 8cb666c0..fbb33d70 100644 --- a/examples/search/search_uniprot.sh +++ b/examples/search/search_uniprot.sh @@ -1,2 +1,3 @@ python3 -m graphgen.run \ ---config_file graphgen/configs/search_protein_config.yaml +--config_file examples/search/search_protein_config.yaml \ +--output_dir cache/ diff --git a/graphgen/bases/base_operator.py b/graphgen/bases/base_operator.py index 300d3178..9ccb8bf2 100644 --- a/graphgen/bases/base_operator.py +++ b/graphgen/bases/base_operator.py @@ -1,12 +1,5 @@ import inspect -import os -from abc import ABC, abstractmethod -from typing import Iterable, Union - -import pandas as pd -import ray - -from graphgen.utils import CURRENT_LOGGER_VAR, set_logger +from graphgen.utils.log import CURRENT_LOGGER_VAR, set_logger class BaseOperator(ABC): @@ -29,7 +22,8 @@ def __init__(self, working_dir: str = "cache", op_name: str = None): log_file = os.path.join(log_dir, f"{self.op_name}_{worker_id_short}.log") self.logger = set_logger( - log_file=log_file, name=f"{self.op_name}.{worker_id_short}", force=True + log_file=log_file, name=f"{self.op_name}.{worker_id_short}", + 
console_level=logging.ERROR, force=True ) self.logger.info( diff --git a/graphgen/bases/base_reader.py b/graphgen/bases/base_reader.py index 5d2af735..ff9ffb18 100644 --- a/graphgen/bases/base_reader.py +++ b/graphgen/bases/base_reader.py @@ -39,6 +39,7 @@ def _should_keep_item(self, item: Dict[str, Any]) -> bool: "table", "equation", "protein", + ], f"Unsupported item type: {item_type}" if item_type == "text": content = item.get(self.text_column, "").strip() diff --git a/graphgen/bases/base_searcher.py b/graphgen/bases/base_searcher.py index f680ab04..42a26681 100644 --- a/graphgen/bases/base_searcher.py +++ b/graphgen/bases/base_searcher.py @@ -1,18 +1,17 @@ +import logging +import os from abc import ABC, abstractmethod from typing import Any, Dict, List +from graphgen.utils.log import set_logger + class BaseSearcher(ABC): """ Abstract base class for searching and retrieving data. """ - @abstractmethod - async def search(self, query: str, **kwargs) -> List[Dict[str, Any]]: - """ - Search for data based on the given query. - :param query: The searcher query. - :param kwargs: Additional keyword arguments for the searcher. - :return: List of dictionaries containing the searcher results. - """ + def get_logger(self): + """Get the logger instance.""" + return self.logger diff --git a/graphgen/engine.py b/graphgen/engine.py index 62ab5281..cc4c4570 100644 --- a/graphgen/engine.py +++ b/graphgen/engine.py @@ -1,30 +1,24 @@ import inspect import logging -from collections import defaultdict, deque -from functools import wraps -from typing import Any, Callable, Dict, List, Set - -import ray -import ray.data - -from graphgen.bases import Config, Node -from graphgen.utils import logger - - -class Engine: - def __init__( - self, config: Dict[str, Any], functions: Dict[str, Callable], **ray_init_kwargs - ): - self.config = Config(**config) - self.global_params = self.config.global_params - self.functions = functions - self.datasets: Dict[str, ray.data.Dataset] = {} + # Disable Ray Data progress bars and verbose output + os.environ.setdefault("RAY_DATA_DISABLE_PROGRESS_BARS", "1") + # Disable metrics exporter to avoid RpcError + os.environ.setdefault("RAY_DISABLE_IMPORTANT_WARNING", "1") + try: + from ray.data import DataContext + ctx = DataContext.get_current() + ctx.enable_rich_progress_bars = False + ctx.use_ray_tqdm = False + except Exception: + pass # Ray Data context might not be available if not ray.is_initialized(): + # Disable metrics exporter to avoid RpcError + ray_init_kwargs.setdefault("_metrics_export_port", 0) context = ray.init( ignore_reinit_error=True, logging_level=logging.ERROR, - log_to_driver=True, + log_to_driver=False, # Disable Ray logs to driver **ray_init_kwargs, ) logger.info("Ray Dashboard URL: %s", context.dashboard_url) diff --git a/graphgen/models/__init__.py b/graphgen/models/__init__.py index 21344d74..4606715b 100644 --- a/graphgen/models/__init__.py +++ b/graphgen/models/__init__.py @@ -4,10 +4,11 @@ AtomicGenerator, CoTGenerator, MultiHopGenerator, + OmicsQAGenerator, QuizGenerator, VQAGenerator, ) -from .kg_builder import LightRAGKGBuilder, MMKGBuilder +from .kg_builder import LightRAGKGBuilder, MMKGBuilder, OmicsKGBuilder from .llm import HTTPClient, OllamaClient, OpenAIClient from .partitioner import ( AnchorBFSPartitioner, diff --git a/graphgen/models/evaluator/length_evaluator.py b/graphgen/models/evaluator/length_evaluator.py index d5c33211..9fbc6f3c 100644 --- a/graphgen/models/evaluator/length_evaluator.py +++ b/graphgen/models/evaluator/length_evaluator.py 
@@ -1,7 +1,8 @@ +import asyncio + from graphgen.bases.datatypes import QAPair from graphgen.models.evaluator.base_evaluator import BaseEvaluator from graphgen.models.tokenizer import Tokenizer -from graphgen.utils import create_event_loop class LengthEvaluator(BaseEvaluator): @@ -11,7 +12,8 @@ def __init__(self, tokenizer_name: str = "cl100k_base", max_concurrent: int = 10 self.tokenizer = Tokenizer(model_name=self.tokenizer_name) async def evaluate_single(self, pair: QAPair) -> float: - loop = create_event_loop() + # In async context, we should use the running loop + loop = asyncio.get_running_loop() return await loop.run_in_executor(None, self._calculate_length, pair.answer) def _calculate_length(self, text: str) -> float: diff --git a/graphgen/models/evaluator/mtld_evaluator.py b/graphgen/models/evaluator/mtld_evaluator.py index c106d86c..3423425b 100644 --- a/graphgen/models/evaluator/mtld_evaluator.py +++ b/graphgen/models/evaluator/mtld_evaluator.py @@ -2,7 +2,7 @@ from graphgen.bases.datatypes import QAPair from graphgen.models.evaluator.base_evaluator import BaseEvaluator -from graphgen.utils import NLTKHelper, create_event_loop, detect_main_language +from graphgen.utils import NLTKHelper, detect_main_language nltk_helper = NLTKHelper() @@ -18,7 +18,9 @@ def __init__(self, max_concurrent: int = 100): self.stopwords_zh: Set[str] = set(nltk_helper.get_stopwords("chinese")) async def evaluate_single(self, pair: QAPair) -> float: - loop = create_event_loop() + # In async context, we should use the running loop + import asyncio + loop = asyncio.get_running_loop() return await loop.run_in_executor(None, self._calculate_mtld_score, pair.answer) def _calculate_mtld_score(self, text: str, threshold=0.72) -> float: diff --git a/graphgen/models/generator/__init__.py b/graphgen/models/generator/__init__.py index 49f8979c..12740f3b 100644 --- a/graphgen/models/generator/__init__.py +++ b/graphgen/models/generator/__init__.py @@ -2,5 +2,6 @@ from .atomic_generator import AtomicGenerator from .cot_generator import CoTGenerator from .multi_hop_generator import MultiHopGenerator + from .quiz_generator import QuizGenerator from .vqa_generator import VQAGenerator diff --git a/graphgen/models/kg_builder/__init__.py b/graphgen/models/kg_builder/__init__.py index 1e7e2c44..cc8ff877 100644 --- a/graphgen/models/kg_builder/__init__.py +++ b/graphgen/models/kg_builder/__init__.py @@ -1,2 +1,3 @@ from .light_rag_kg_builder import LightRAGKGBuilder from .mm_kg_builder import MMKGBuilder + diff --git a/graphgen/models/llm/local/sglang_wrapper.py b/graphgen/models/llm/local/sglang_wrapper.py index e8648613..1918fc79 100644 --- a/graphgen/models/llm/local/sglang_wrapper.py +++ b/graphgen/models/llm/local/sglang_wrapper.py @@ -13,10 +13,10 @@ class SGLangWrapper(BaseLLMWrapper): def __init__( self, model: str, + tp_size: int = 1, temperature: float = 0.0, top_p: float = 1.0, topk: int = 5, - tp_size: int = 1, **kwargs: Any, ): super().__init__(temperature=temperature, top_p=top_p, **kwargs) diff --git a/graphgen/models/llm/local/vllm_wrapper.py b/graphgen/models/llm/local/vllm_wrapper.py index c6e5feac..40eb0d73 100644 --- a/graphgen/models/llm/local/vllm_wrapper.py +++ b/graphgen/models/llm/local/vllm_wrapper.py @@ -1,16 +1,14 @@ -import math import uuid +import math from typing import Any, List, Optional - from graphgen.bases.base_llm_wrapper import BaseLLMWrapper from graphgen.bases.datatypes import Token - class VLLMWrapper(BaseLLMWrapper): """ Async inference backend based on vLLM. 
""" - +>>>>>>> feature/multi-omics-qa def __init__( self, model: str, @@ -99,11 +97,7 @@ async def generate_topk_per_token( async for request_output in result_generator: final_output = request_output - if ( - not final_output - or not final_output.outputs - or not final_output.outputs[0].logprobs - ): + if not final_output or not final_output.outputs or not final_output.outputs[0].logprobs: return [] top_logprobs = final_output.outputs[0].logprobs[0] diff --git a/graphgen/models/partitioner/anchor_bfs_partitioner.py b/graphgen/models/partitioner/anchor_bfs_partitioner.py index 09133af7..50e607ee 100644 --- a/graphgen/models/partitioner/anchor_bfs_partitioner.py +++ b/graphgen/models/partitioner/anchor_bfs_partitioner.py @@ -1,6 +1,6 @@ import random from collections import deque -from typing import Any, Iterable, List, Literal, Set, Tuple +from typing import Any, Iterable, List, Literal, Set, Tuple, Union from graphgen.bases import BaseGraphStorage from graphgen.bases.datatypes import Community @@ -18,16 +18,18 @@ class AnchorBFSPartitioner(BFSPartitioner): 2. Expand the community using BFS until the max unit size is reached.(A unit is a node or an edge.) 3. Non-anchor units can only be "pulled" into a community and never become seeds themselves. For example, for VQA tasks, we may want to use image nodes as anchors and expand to nearby text nodes and edges. - """ - - def __init__( - self, - *, - anchor_type: Literal["image"] = "image", + anchor_type: Union[ + Literal["image", "dna", "rna", "protein"], + List[Literal["dna", "rna", "protein"]], + ] = "image", anchor_ids: Set[str] | None = None, ) -> None: super().__init__() - self.anchor_type = anchor_type + # Normalize anchor_type to always be a list for internal processing + if isinstance(anchor_type, str): + self.anchor_types = [anchor_type] + else: + self.anchor_types = list(anchor_type) self.anchor_ids = anchor_ids def partition( @@ -68,10 +70,53 @@ def _pick_anchor_ids( return self.anchor_ids anchor_ids: Set[str] = set() + anchor_types_lower = [at.lower() for at in self.anchor_types] + for node_id, meta in nodes: + # Check if node matches any of the anchor types + matched = False + + # Check 1: entity_type (for image, etc.) 
diff --git a/graphgen/models/partitioner/anchor_bfs_partitioner.py b/graphgen/models/partitioner/anchor_bfs_partitioner.py
index 09133af7..50e607ee 100644
--- a/graphgen/models/partitioner/anchor_bfs_partitioner.py
+++ b/graphgen/models/partitioner/anchor_bfs_partitioner.py
@@ -1,6 +1,6 @@
 import random
 from collections import deque
-from typing import Any, Iterable, List, Literal, Set, Tuple
+from typing import Any, Iterable, List, Literal, Set, Tuple, Union
 
 from graphgen.bases import BaseGraphStorage
 from graphgen.bases.datatypes import Community
@@ -18,16 +18,23 @@ class AnchorBFSPartitioner(BFSPartitioner):
     2. Expand the community using BFS until the max unit size is reached.(A unit is a node or an edge.)
     3. Non-anchor units can only be "pulled" into a community and never become seeds themselves.
     For example, for VQA tasks, we may want to use image nodes as anchors and expand to nearby text nodes and edges.
     """
 
     def __init__(
         self,
         *,
-        anchor_type: Literal["image"] = "image",
+        anchor_type: Union[
+            Literal["image", "dna", "rna", "protein"],
+            List[Literal["dna", "rna", "protein"]],
+        ] = "image",
         anchor_ids: Set[str] | None = None,
     ) -> None:
         super().__init__()
-        self.anchor_type = anchor_type
+        # Normalize anchor_type to always be a list for internal processing
+        if isinstance(anchor_type, str):
+            self.anchor_types = [anchor_type]
+        else:
+            self.anchor_types = list(anchor_type)
         self.anchor_ids = anchor_ids
 
     def partition(
@@ -68,10 +75,53 @@ def _pick_anchor_ids(
             return self.anchor_ids
 
         anchor_ids: Set[str] = set()
+        anchor_types_lower = [at.lower() for at in self.anchor_types]
+
         for node_id, meta in nodes:
+            # Check if node matches any of the anchor types
+            matched = False
+
+            # Check 1: entity_type (for image, etc.)
             node_type = str(meta.get("entity_type", "")).lower()
-            if self.anchor_type.lower() in node_type:
+            for anchor_type_lower in anchor_types_lower:
+                if anchor_type_lower in node_type:
+                    anchor_ids.add(node_id)
+                    matched = True
+                    break
+
+            if matched:
+                continue
+
+            # Check 2: molecule_type (for omics data: dna, rna, protein)
+            molecule_type = str(meta.get("molecule_type", "")).lower()
+            if molecule_type in anchor_types_lower:
                 anchor_ids.add(node_id)
+                continue
+
+            # Check 3: source_id prefix (for omics data: dna-, rna-, protein-)
+            source_id = str(meta.get("source_id", "")).lower()
+            for anchor_type_lower in anchor_types_lower:
+                if source_id.startswith(f"{anchor_type_lower}-"):
+                    anchor_ids.add(node_id)
+                    matched = True
+                    break
+
+            if matched:
+                continue
+
+            # Check 4: source_id may contain multiple IDs joined by <SEP> (lowercased here)
+            if "<sep>" in source_id:
+                source_ids = source_id.split("<sep>")
+                for sid in source_ids:
+                    sid = sid.strip()
+                    for anchor_type_lower in anchor_types_lower:
+                        if sid.startswith(f"{anchor_type_lower}-"):
+                            anchor_ids.add(node_id)
+                            matched = True
+                            break
+                    if matched:
+                        break
+
         return anchor_ids
 
     @staticmethod
@@ -113,7 +163,21 @@ def _grow_community(
                 if it in used_e:
                     continue
                 used_e.add(it)
-                u, v = it
+                # Convert frozenset to tuple for edge representation
+                # Note: Self-loops should be filtered during graph construction,
+                # but we handle edge cases defensively
+                try:
+                    u, v = tuple(it)
+                except ValueError:
+                    # Handle edge case: frozenset with unexpected number of elements
+                    # This should not happen if graph construction is correct
+                    edge_nodes = list(it)
+                    if len(edge_nodes) == 1:
+                        # Self-loop edge (should have been filtered during graph construction)
+                        u, v = edge_nodes[0], edge_nodes[0]
+                    else:
+                        # Invalid edge, skip it
+                        continue
                 comm_e.append((u, v))
                 cnt += 1
                 for n in it:
- """ + # Apply filtering logic inline (similar to BaseReader.filter) + if doc.get("type") == "text": + content = doc.get(self.text_column, "").strip() + if content: + yield doc + elif doc.get("type") in ("image", "table", "equation"): + img_path = doc.get("img_path") + if self._image_exists(img_path): + yield doc + else: + yield doc + except json.JSONDecodeError as e: + logger.error("Error decoding JSON line: %s. Error: %s", line, e) - def read(self, input_path: Union[str, List[str]]) -> ray.data.Dataset: + @staticmethod + def _image_exists(path_or_url: str, timeout: int = 3) -> bool: """ - Read JSON file and return Ray Dataset. - :param input_path: Path to JSON/JSONL file or list of JSON/JSONL files. - :return: Ray Dataset containing validated and filtered data. + Check if an image exists at the given local path or URL. + :param path_or_url: Local file path or remote URL of the image. + :param timeout: Timeout for remote URL requests in seconds. + :return: True if the image exists, False otherwise. """ - if self.modalities and len(self.modalities) >= 2: - ds: ray.data.Dataset = ray.data.from_items([]) - for file in input_path if isinstance(input_path, list) else [input_path]: - data = [] - if file.endswith(".jsonl"): - with open(file, "r", encoding="utf-8") as f: - for line in f: - item = json.loads(line) - data.append(item) - else: - with open(file, "r", encoding="utf-8") as f: - data = json.load(f) - data = self._unify_schema(data) - file_ds: ray.data.Dataset = ray.data.from_items(data) - ds = ds.union(file_ds) # type: ignore - else: - ds = ray.data.read_json(input_path) - ds = ds.map_batches(self._validate_batch, batch_format="pandas") - ds = ds.filter(self._should_keep_item) - return ds + if not path_or_url: + return False + if not path_or_url.startswith(("http://", "https://", "ftp://")): + path = path_or_url.replace("file://", "", 1) + path = os.path.abspath(path) + return os.path.isfile(path) + try: + import requests + resp = requests.head(path_or_url, allow_redirects=True, timeout=timeout) + return resp.status_code == 200 + except Exception: + return False @staticmethod def _unify_schema(data): diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index f453c700..b43f63a2 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -19,141 +19,12 @@ ) from graphgen.bases import BaseSearcher -from graphgen.utils import logger +>>>>>>> feature/multi-omics-qa @lru_cache(maxsize=None) def _get_pool(): - return ThreadPoolExecutor(max_workers=10) - -# ensure only one NCBI request at a time -_ncbi_lock = asyncio.Lock() - - -class NCBISearch(BaseSearcher): - """ - NCBI Search client to search DNA/GenBank/Entrez databases. - 1) Get the gene/DNA by accession number or gene ID. - 2) Search with keywords or gene names (fuzzy search). - 3) Search with FASTA sequence (BLAST search for DNA sequences). - - API Documentation: https://www.ncbi.nlm.nih.gov/home/develop/api/ - Note: NCBI has rate limits (max 3 requests per second), delays are required between requests. - """ - - def __init__( - self, - use_local_blast: bool = False, - local_blast_db: str = "nt_db", - email: str = "email@example.com", - api_key: str = "", - tool: str = "GraphGen", - ): - """ - Initialize the NCBI Search client. - - Args: - use_local_blast (bool): Whether to use local BLAST database. - local_blast_db (str): Path to the local BLAST database. - email (str): Email address for NCBI API requests. 
- api_key (str): API key for NCBI API requests, see https://account.ncbi.nlm.nih.gov/settings/. - tool (str): Tool name for NCBI API requests. - """ - super().__init__() - Entrez.timeout = 60 # 60 seconds timeout - Entrez.email = email - Entrez.tool = tool - if api_key: - Entrez.api_key = api_key - Entrez.max_tries = 10 if api_key else 3 - Entrez.sleep_between_tries = 5 - self.use_local_blast = use_local_blast - self.local_blast_db = local_blast_db - if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.nhr"): - logger.error("Local BLAST database files not found. Please check the path.") - self.use_local_blast = False - - @staticmethod - def _nested_get(data: dict, *keys, default=None): - """Safely traverse nested dictionaries.""" - for key in keys: - if not isinstance(data, dict): - return default - data = data.get(key, default) - return data - - @staticmethod - def _infer_molecule_type_detail(accession: Optional[str], gene_type: Optional[int] = None) -> Optional[str]: - """Infer molecule_type_detail from accession prefix or gene type.""" - if accession: - if accession.startswith(("NM_", "XM_")): - return "mRNA" - if accession.startswith(("NC_", "NT_")): - return "genomic DNA" - if accession.startswith(("NR_", "XR_")): - return "RNA" - if accession.startswith("NG_"): - return "genomic region" - # Fallback: infer from gene type if available - if gene_type is not None: - gene_type_map = { - 3: "rRNA", - 4: "tRNA", - 5: "snRNA", - 6: "ncRNA", - } - return gene_type_map.get(gene_type) - return None - - def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict: - """ - Convert an Entrez gene record to a dictionary. - All extraction logic is inlined for maximum clarity and performance. - """ - if not gene_record: - raise ValueError("Empty gene record") - - data = gene_record[0] - locus = (data.get("Entrezgene_locus") or [{}])[0] - - # Extract common nested paths once - gene_ref = self._nested_get(data, "Entrezgene_gene", "Gene-ref", default={}) - biosource = self._nested_get(data, "Entrezgene_source", "BioSource", default={}) - - # Process synonyms - synonyms_raw = gene_ref.get("Gene-ref_syn", []) - gene_synonyms = [] - if isinstance(synonyms_raw, list): - for syn in synonyms_raw: - gene_synonyms.append(syn.get("Gene-ref_syn_E") if isinstance(syn, dict) else str(syn)) - elif synonyms_raw: - gene_synonyms.append(str(synonyms_raw)) - - # Extract location info - label = locus.get("Gene-commentary_label", "") - chromosome_match = re.search(r"Chromosome\s+(\S+)", str(label)) if label else None - - seq_interval = self._nested_get( - locus, "Gene-commentary_seqs", 0, "Seq-loc_int", "Seq-interval", default={} - ) - genomic_location = ( - f"{seq_interval.get('Seq-interval_from')}-{seq_interval.get('Seq-interval_to')}" - if seq_interval.get('Seq-interval_from') and seq_interval.get('Seq-interval_to') - else None - ) - - # Extract representative accession (prefer type 3 = mRNA/transcript) - representative_accession = next( - ( - product.get("Gene-commentary_accession") - for product in locus.get("Gene-commentary_products", []) - if product.get("Gene-commentary_type") == "3" - ), - None, - ) - # Fallback: if no type 3 accession, try any available accession - # This is needed for genes that don't have mRNA transcripts but have other sequence records if not representative_accession: representative_accession = next( ( @@ -209,6 +80,12 @@ def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict: "_representative_accession": representative_accession, } + @retry( + 
stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RequestException, IncompleteRead)), + reraise=True, + ) def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None) -> Optional[dict]: """Get gene information by Gene ID.""" def _extract_metadata_from_genbank(result: dict, accession: str): @@ -217,12 +94,7 @@ def _extract_metadata_from_genbank(result: dict, accession: str): record = SeqIO.read(handle, "genbank") result["title"] = record.description - result["molecule_type_detail"] = ( - "mRNA" if accession.startswith(("NM_", "XM_")) else - "genomic DNA" if accession.startswith(("NC_", "NT_")) else - "RNA" if accession.startswith(("NR_", "XR_")) else - "genomic region" if accession.startswith("NG_") else "N/A" - ) + result["molecule_type_detail"] = self._infer_molecule_type_detail(accession) or "N/A" for feature in record.features: if feature.type == "source": @@ -249,7 +121,7 @@ def _extract_sequence_from_fasta(result: dict, accession: str): result["sequence"] = str(fasta_record.seq) result["sequence_length"] = len(fasta_record.seq) except Exception as fasta_exc: - logger.warning( + self.logger.warning( "Failed to extract sequence from accession %s using FASTA format: %s", accession, fasta_exc ) @@ -257,25 +129,62 @@ def _extract_sequence_from_fasta(result: dict, accession: str): result["sequence_length"] = None return result + def _extract_sequence(result: dict, accession: str): + """ + Extract sequence using the appropriate method based on configuration. + If use_local_blast=True, use local database. Otherwise, use NCBI API. + Always fetches sequence (no option to skip). + """ + # If using local BLAST, use local database + if self.use_local_blast: + sequence = self._extract_sequence_from_local_db(accession) + + if sequence: + result["sequence"] = sequence + result["sequence_length"] = len(sequence) + else: + # Failed to extract from local DB, set to None (no fallback to API) + result["sequence"] = None + result["sequence_length"] = None + self.logger.warning( + "Failed to extract sequence from local DB for accession %s. 
" + "Not falling back to NCBI API as use_local_blast=True.", + accession + ) + else: + # Use NCBI API to fetch sequence + result = _extract_sequence_from_fasta(result, accession) + + return result + try: with Entrez.efetch(db="gene", id=gene_id, retmode="xml") as handle: gene_record = Entrez.read(handle) - if not gene_record: - return None - result = self._gene_record_to_dict(gene_record, gene_id) - if accession := (preferred_accession or result.get("_representative_accession")): - result = _extract_metadata_from_genbank(result, accession) - result = _extract_sequence_from_fasta(result, accession) + if not gene_record: + return None + + result = self._gene_record_to_dict(gene_record, gene_id) - result.pop("_representative_accession", None) - return result + if accession := (preferred_accession or result.get("_representative_accession")): + result = _extract_metadata_from_genbank(result, accession) + # Extract sequence using appropriate method + result = _extract_sequence(result, accession) + + result.pop("_representative_accession", None) + return result except (RequestException, IncompleteRead): raise except Exception as exc: - logger.error("Gene ID %s not found: %s", gene_id, exc) + self.logger.error("Gene ID %s not found: %s", gene_id, exc) return None + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RequestException, IncompleteRead)), + reraise=True, + ) def get_by_accession(self, accession: str) -> Optional[dict]: """Get sequence information by accession number.""" def _extract_gene_id(link_handle): @@ -297,20 +206,28 @@ def _extract_gene_id(link_handle): gene_id = _extract_gene_id(link_handle) if not gene_id: - logger.warning("Accession %s has no associated GeneID", accession) + self.logger.warning("Accession %s has no associated GeneID", accession) return None result = self.get_by_gene_id(gene_id, preferred_accession=accession) + if result: result["id"] = accession result["url"] = f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}" + return result except (RequestException, IncompleteRead): raise except Exception as exc: - logger.error("Accession %s not found: %s", accession, exc) + self.logger.error("Accession %s not found: %s", accession, exc) return None + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RequestException, IncompleteRead)), + reraise=True, + ) def get_best_hit(self, keyword: str) -> Optional[dict]: """Search NCBI Gene database with a keyword and return the best hit.""" if not keyword.strip(): @@ -320,31 +237,87 @@ def get_best_hit(self, keyword: str) -> Optional[dict]: for search_term in [f"{keyword}[Gene] OR {keyword}[All Fields]", keyword]: with Entrez.esearch(db="gene", term=search_term, retmax=1, sort="relevance") as search_handle: search_results = Entrez.read(search_handle) - if len(gene_id := search_results.get("IdList", [])) > 0: - return self.get_by_gene_id(gene_id) + + if len(gene_id := search_results.get("IdList", [])) > 0: + result = self.get_by_gene_id(gene_id[0]) + return result except (RequestException, IncompleteRead): raise except Exception as e: - logger.error("Keyword %s not found: %s", keyword, e) + self.logger.error("Keyword %s not found: %s", keyword, e) return None + def _extract_sequence_from_local_db(self, accession: str) -> Optional[str]: + """Extract sequence from local BLAST database using blastdbcmd.""" + try: + cmd = [ + "blastdbcmd", + "-db", self.local_blast_db, + "-entry", 
accession, + "-outfmt", "%s" # Only sequence, no header + ] + sequence = subprocess.check_output( + cmd, + text=True, + timeout=10, # 10 second timeout for local extraction + stderr=subprocess.DEVNULL + ).strip() + return sequence if sequence else None + except subprocess.TimeoutExpired: + self.logger.warning("Timeout extracting sequence from local DB for accession %s", accession) + return None + except Exception as exc: + self.logger.warning("Failed to extract sequence from local DB for accession %s: %s", accession, exc) + return None + def _local_blast(self, seq: str, threshold: float) -> Optional[str]: - """Perform local BLAST search using local BLAST database.""" + """ + Perform local BLAST search using local BLAST database. + Optimized with multi-threading and faster output format. + """ try: with tempfile.NamedTemporaryFile(mode="w+", suffix=".fa", delete=False) as tmp: tmp.write(f">query\n{seq}\n") tmp_name = tmp.name + # Optimized BLAST command with: + # - num_threads: Use multiple threads for faster search + # - outfmt 6 sacc: Only return accession (minimal output) + # - max_target_seqs 1: Only need the best hit + # - evalue: Threshold for significance cmd = [ "blastn", "-db", self.local_blast_db, "-query", tmp_name, - "-evalue", str(threshold), "-max_target_seqs", "1", "-outfmt", "6 sacc" + "-evalue", str(threshold), + "-max_target_seqs", "1", + "-num_threads", str(self.blast_num_threads), + "-outfmt", "6 sacc" # Only accession, tab-separated ] - logger.debug("Running local blastn: %s", " ".join(cmd)) - out = subprocess.check_output(cmd, text=True).strip() + self.logger.debug("Running local blastn (threads=%d): %s", + self.blast_num_threads, " ".join(cmd)) + + # Run BLAST with timeout to avoid hanging + try: + out = subprocess.check_output( + cmd, + text=True, + timeout=300, # 5 minute timeout for BLAST search + stderr=subprocess.DEVNULL # Suppress BLAST warnings to reduce I/O + ).strip() + except subprocess.TimeoutExpired: + self.logger.warning("BLAST search timed out after 5 minutes for sequence") + os.remove(tmp_name) + return None + os.remove(tmp_name) return out.split("\n", maxsplit=1)[0] if out else None except Exception as exc: - logger.error("Local blastn failed: %s", exc) + self.logger.error("Local blastn failed: %s", exc) + # Clean up temp file if it still exists + try: + if 'tmp_name' in locals(): + os.remove(tmp_name) + except Exception: + pass return None def get_by_fasta(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: @@ -362,13 +335,13 @@ def _extract_and_normalize_sequence(sequence: str) -> Optional[str]: def _process_network_blast_result(blast_record, seq: str, threshold: float) -> Optional[dict]: """Process network BLAST result and return dictionary or None.""" if not blast_record.alignments: - logger.info("No BLAST hits found for the given sequence.") + self.logger.info("No BLAST hits found for the given sequence.") return None best_alignment = blast_record.alignments[0] best_hsp = best_alignment.hsps[0] if best_hsp.expect > threshold: - logger.info("No BLAST hits below the threshold E-value.") + self.logger.info("No BLAST hits below the threshold E-value.") return None hit_id = best_alignment.hit_id @@ -389,23 +362,35 @@ def _process_network_blast_result(blast_record, seq: str, threshold: float) -> O try: if not (seq := _extract_and_normalize_sequence(sequence)): - logger.error("Empty or invalid DNA sequence provided.") + self.logger.error("Empty or invalid DNA sequence provided.") return None # Try local BLAST first if enabled - if 
self.use_local_blast and (accession := self._local_blast(seq, threshold)): - logger.debug("Local BLAST found accession: %s", accession) - return self.get_by_accession(accession) - - # Fall back to network BLAST - logger.debug("Falling back to NCBIWWW.qblast") + if self.use_local_blast: + accession = self._local_blast(seq, threshold) + + if accession: + self.logger.debug("Local BLAST found accession: %s", accession) + # When using local BLAST, skip sequence fetching by default (faster, fewer API calls) + # Sequence is already known from the query, so we only need metadata + result = self.get_by_accession(accession) + return result + + self.logger.info( + "Local BLAST found no match for sequence. " + "API fallback disabled when using local database." + ) + return None + # Fall back to network BLAST only if local BLAST is not enabled + self.logger.debug("Falling back to NCBIWWW.qblast") with NCBIWWW.qblast("blastn", "nr", seq, hitlist_size=1, expect=threshold) as result_handle: - return _process_network_blast_result(NCBIXML.read(result_handle), seq, threshold) + result = _process_network_blast_result(NCBIXML.read(result_handle), seq, threshold) + return result except (RequestException, IncompleteRead): raise except Exception as e: - logger.error("BLAST search failed: %s", e) + self.logger.error("BLAST search failed: %s", e) return None @retry( @@ -417,25 +402,34 @@ def _process_network_blast_result(blast_record, seq: str, threshold: float) -> O async def search(self, query: str, threshold: float = 0.01, **kwargs) -> Optional[Dict]: """Search NCBI with either a gene ID, accession number, keyword, or DNA sequence.""" if not query or not isinstance(query, str): - logger.error("Empty or non-string input.") + self.logger.error("Empty or non-string input.") return None query = query.strip() - logger.debug("NCBI search query: %s", query) + self.logger.debug("NCBI search query: %s", query) loop = asyncio.get_running_loop() - # limit concurrent requests (NCBI rate limit: max 3 requests per second) - async with _ncbi_lock: - # Auto-detect query type and execute in thread pool - if query.startswith(">") or re.fullmatch(r"[ATCGN\s]+", query, re.I): - result = await loop.run_in_executor(_get_pool(), self.get_by_fasta, query, threshold) - elif re.fullmatch(r"^\d+$", query): - result = await loop.run_in_executor(_get_pool(), self.get_by_gene_id, query) - elif re.fullmatch(r"[A-Z]{2}_\d+\.?\d*", query, re.I): - result = await loop.run_in_executor(_get_pool(), self.get_by_accession, query) - else: - result = await loop.run_in_executor(_get_pool(), self.get_best_hit, query) + # Auto-detect query type and execute in thread pool + # All methods need lock because they all call NCBI API (rate limit: max 3 requests per second) + # Even if get_by_fasta uses local BLAST, it still calls get_by_accession which needs API + async def _execute_with_lock(func, *args): + """Execute function with lock for NCBI API calls.""" + async with _blast_lock: + return await loop.run_in_executor(_get_pool(), func, *args) + + if query.startswith(">") or re.fullmatch(r"[ATCGN\s]+", query, re.I): + # FASTA sequence: always use lock (even with local BLAST, get_by_accession needs API) + result = await _execute_with_lock(self.get_by_fasta, query, threshold) + elif re.fullmatch(r"^\d+$", query): + # Gene ID: always use lock (network API call) + result = await _execute_with_lock(self.get_by_gene_id, query) + elif re.fullmatch(r"[A-Z]{2}_\d+\.?\d*", query, re.I): + # Accession: always use lock (network API call) + result = await 
_execute_with_lock(self.get_by_accession, query) + else: + # Keyword: always use lock (network API call) + result = await _execute_with_lock(self.get_best_hit, query) if result: result["_search_query"] = query diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py index 58c5e86e..3654b60d 100644 --- a/graphgen/models/searcher/db/rnacentral_searcher.py +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -18,12 +18,12 @@ ) from graphgen.bases import BaseSearcher -from graphgen.utils import logger +>>>>>>> feature/multi-omics-qa @lru_cache(maxsize=None) def _get_pool(): - return ThreadPoolExecutor(max_workers=10) + return ThreadPoolExecutor(max_workers=20) # NOTE:can increase for better parallelism class RNACentralSearch(BaseSearcher): """ @@ -35,14 +35,24 @@ class RNACentralSearch(BaseSearcher): API Documentation: https://rnacentral.org/api/v1 """ - def __init__(self, use_local_blast: bool = False, local_blast_db: str = "rna_db"): - super().__init__() + def __init__( + self, + use_local_blast: bool = False, + local_blast_db: str = "rna_db", + api_timeout: int = 30, + blast_num_threads: int = 4, + working_dir: str = "cache", + ): + super().__init__(working_dir=working_dir) self.base_url = "https://rnacentral.org/api/v1" self.headers = {"Accept": "application/json"} self.use_local_blast = use_local_blast self.local_blast_db = local_blast_db + self.api_timeout = api_timeout + self.blast_num_threads = blast_num_threads # Number of threads for BLAST search + if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.nhr"): - logger.error("Local BLAST database files not found. Please check the path.") + self.logger.error("Local BLAST database files not found. Please check the path.") self.use_local_blast = False @staticmethod @@ -58,7 +68,8 @@ def _rna_data_to_dict( acc = xref.get("accession", {}) if s := acc.get("species"): organisms.add(s) - if g := acc.get("gene", "").strip(): + gene_value = acc.get("gene") + if isinstance(gene_value, str) and (g := gene_value.strip()): gene_names.add(g) if m := xref.get("modifications"): modifications.extend(m) @@ -141,29 +152,29 @@ def _calculate_md5(sequence: str) -> str: return hashlib.md5(normalized_seq.encode("ascii")).hexdigest() - def get_by_rna_id(self, rna_id: str) -> Optional[dict]: - """ - Get RNA information by RNAcentral ID. - :param rna_id: RNAcentral ID (e.g., URS0000000001). - :return: A dictionary containing RNA information or None if not found. 
- """ - try: - url = f"{self.base_url}/rna/{rna_id}" - url += "?flat=true" - - resp = requests.get(url, headers=self.headers, timeout=30) + resp = requests.get(url, headers=self.headers, timeout=self.api_timeout) resp.raise_for_status() rna_data = resp.json() xrefs_data = rna_data.get("xrefs", []) - return self._rna_data_to_dict(rna_id, rna_data, xrefs_data) + result = self._rna_data_to_dict(rna_id, rna_data, xrefs_data) + return result + except requests.Timeout as e: + self.logger.warning("Timeout getting RNA ID %s (timeout=%ds): %s", rna_id, self.api_timeout, e) + return None except requests.RequestException as e: - logger.error("Network error getting RNA ID %s: %s", rna_id, e) + self.logger.error("Network error getting RNA ID %s: %s", rna_id, e) return None except Exception as e: # pylint: disable=broad-except - logger.error("Unexpected error getting RNA ID %s: %s", rna_id, e) + self.logger.error("Unexpected error getting RNA ID %s: %s", rna_id, e) return None + @retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=2, max=10), + retry=retry_if_exception_type((requests.Timeout, requests.RequestException)), + reraise=False, + ) def get_best_hit(self, keyword: str) -> Optional[dict]: """ Search RNAcentral with a keyword and return the best hit. @@ -172,20 +183,20 @@ def get_best_hit(self, keyword: str) -> Optional[dict]: """ keyword = keyword.strip() if not keyword: - logger.warning("Empty keyword provided to get_best_hit") + self.logger.warning("Empty keyword provided to get_best_hit") return None try: url = f"{self.base_url}/rna" params = {"search": keyword, "format": "json"} - resp = requests.get(url, params=params, headers=self.headers, timeout=30) + resp = requests.get(url, params=params, headers=self.headers, timeout=self.api_timeout) resp.raise_for_status() data = resp.json() results = data.get("results", []) if not results: - logger.info("No search results for keyword: %s", keyword) + self.logger.info("No search results for keyword: %s", keyword) return None first_result = results[0] @@ -195,33 +206,65 @@ def get_best_hit(self, keyword: str) -> Optional[dict]: detailed = self.get_by_rna_id(rna_id) if detailed: return detailed - logger.debug("Using search result data for %s", rna_id or "unknown") + self.logger.debug("Using search result data for %s", rna_id or "unknown") return self._rna_data_to_dict(rna_id or "", first_result) except requests.RequestException as e: - logger.error("Network error searching keyword '%s': %s", keyword, e) + self.logger.error("Network error searching keyword '%s': %s", keyword, e) return None except Exception as e: - logger.error("Unexpected error searching keyword '%s': %s", keyword, e) + self.logger.error("Unexpected error searching keyword '%s': %s", keyword, e) return None def _local_blast(self, seq: str, threshold: float) -> Optional[str]: - """Perform local BLAST search using local BLAST database.""" + """ + Perform local BLAST search using local BLAST database. + Optimized with multi-threading and faster output format. 
+ """ try: + # Use temporary file for query sequence with tempfile.NamedTemporaryFile(mode="w+", suffix=".fa", delete=False) as tmp: tmp.write(f">query\n{seq}\n") tmp_name = tmp.name + # Optimized BLAST command with: + # - num_threads: Use multiple threads for faster search + # - outfmt 6 sacc: Only return accession (minimal output) + # - max_target_seqs 1: Only need the best hit + # - evalue: Threshold for significance cmd = [ "blastn", "-db", self.local_blast_db, "-query", tmp_name, - "-evalue", str(threshold), "-max_target_seqs", "1", "-outfmt", "6 sacc" + "-evalue", str(threshold), + "-max_target_seqs", "1", + "-num_threads", str(self.blast_num_threads), + "-outfmt", "6 sacc" # Only accession, tab-separated ] - logger.debug("Running local blastn for RNA: %s", " ".join(cmd)) - out = subprocess.check_output(cmd, text=True).strip() + self.logger.debug("Running local blastn for RNA (threads=%d): %s", + self.blast_num_threads, " ".join(cmd)) + + # Run BLAST with timeout to avoid hanging + try: + out = subprocess.check_output( + cmd, + text=True, + timeout=300, # 5 minute timeout for BLAST search + stderr=subprocess.DEVNULL # Suppress BLAST warnings to reduce I/O + ).strip() + except subprocess.TimeoutExpired: + self.logger.warning("BLAST search timed out after 5 minutes for sequence") + os.remove(tmp_name) + return None + os.remove(tmp_name) return out.split("\n", maxsplit=1)[0] if out else None except Exception as exc: - logger.error("Local blastn failed: %s", exc) + self.logger.error("Local blastn failed: %s", exc) + # Clean up temp file if it still exists + try: + if 'tmp_name' in locals(): + os.remove(tmp_name) + except Exception: + pass return None def get_by_fasta(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: @@ -240,23 +283,36 @@ def _extract_sequence(sequence: str) -> Optional[str]: seq = "".join(seq_lines[1:]) else: seq = sequence.strip().replace(" ", "").replace("\n", "") - return seq if seq and re.fullmatch(r"[AUCGN\s]+", seq, re.I) else None + # Accept both U (original RNA) and T + return seq if seq and re.fullmatch(r"[AUCGTN\s]+", seq, re.I) else None try: seq = _extract_sequence(sequence) if not seq: - logger.error("Empty or invalid RNA sequence provided.") + self.logger.error("Empty or invalid RNA sequence provided.") return None # Try local BLAST first if enabled if self.use_local_blast: accession = self._local_blast(seq, threshold) if accession: - logger.debug("Local BLAST found accession: %s", accession) - return self.get_by_rna_id(accession) + self.logger.debug("Local BLAST found accession: %s", accession) + detailed = self.get_by_rna_id(accession) + if detailed: + return detailed + self.logger.info( + "Local BLAST found accession %s but could not retrieve metadata from API.", + accession + ) + return None + self.logger.info( + "Local BLAST found no match for sequence. " + "API fallback disabled when using local database." 
+ ) + return None - # Fall back to RNAcentral API if local BLAST didn't find result - logger.debug("Falling back to RNAcentral API.") + # Fall back to RNAcentral API only if local BLAST is not enabled + self.logger.debug("Falling back to RNAcentral API.") md5_hash = self._calculate_md5(seq) search_url = f"{self.base_url}/rna" @@ -269,15 +325,22 @@ def _extract_sequence(sequence: str) -> Optional[str]: results = search_results.get("results", []) if not results: - logger.info("No exact match found in RNAcentral for sequence") + self.logger.info("No exact match found in RNAcentral for sequence") return None + rna_id = results[0].get("rnacentral_id") - if not rna_id: - logger.error("No RNAcentral ID found in search results.") - return None - return self.get_by_rna_id(rna_id) + if rna_id: + detailed = self.get_by_rna_id(rna_id) + if detailed: + return detailed + # Fallback: use search result data if get_by_rna_id returns None + self.logger.debug("Using search result data for %s (get_by_rna_id returned None)", rna_id) + return self._rna_data_to_dict(rna_id, results[0]) + + self.logger.error("No RNAcentral ID found in search results.") + return None except Exception as e: - logger.error("Sequence search failed: %s", e) + self.logger.error("Sequence search failed: %s", e) return None @retry( @@ -289,18 +352,21 @@ def _extract_sequence(sequence: str) -> Optional[str]: async def search(self, query: str, threshold: float = 0.1, **kwargs) -> Optional[Dict]: """Search RNAcentral with either an RNAcentral ID, keyword, or RNA sequence.""" if not query or not isinstance(query, str): - logger.error("Empty or non-string input.") + self.logger.error("Empty or non-string input.") return None query = query.strip() - logger.debug("RNAcentral search query: %s", query) + self.logger.debug("RNAcentral search query: %s", query) loop = asyncio.get_running_loop() - # check if RNA sequence (AUCG characters, contains U) - if query.startswith(">") or ( - re.fullmatch(r"[AUCGN\s]+", query, re.I) and "U" in query.upper() - ): + # check if RNA sequence (AUCG or ATCG characters, contains U or T) + # Note: Sequences with T are also RNA sequences + is_rna_sequence = query.startswith(">") or ( + re.fullmatch(r"[AUCGTN\s]+", query, re.I) and + ("U" in query.upper() or "T" in query.upper()) + ) + if is_rna_sequence: result = await loop.run_in_executor(_get_pool(), self.get_by_fasta, query, threshold) # check if RNAcentral ID (typically starts with URS) elif re.fullmatch(r"URS\d+", query, re.I): diff --git a/graphgen/models/searcher/db/uniprot_searcher.py b/graphgen/models/searcher/db/uniprot_searcher.py index f5542f8c..31876c38 100644 --- a/graphgen/models/searcher/db/uniprot_searcher.py +++ b/graphgen/models/searcher/db/uniprot_searcher.py @@ -19,12 +19,12 @@ ) from graphgen.bases import BaseSearcher -from graphgen.utils import logger +>>>>>>> feature/multi-omics-qa @lru_cache(maxsize=None) def _get_pool(): - return ThreadPoolExecutor(max_workers=10) + return ThreadPoolExecutor(max_workers=20) # NOTE:can increase for better parallelism # ensure only one BLAST searcher at a time @@ -39,12 +39,20 @@ class UniProtSearch(BaseSearcher): 3) Search with FASTA sequence (BLAST searcher). Note that NCBIWWW does not support async. 
""" - def __init__(self, use_local_blast: bool = False, local_blast_db: str = "sp_db"): - super().__init__() + def __init__( + self, + use_local_blast: bool = False, + local_blast_db: str = "sp_db", + blast_num_threads: int = 4, + working_dir: str = "cache", + ): + super().__init__(working_dir=working_dir) self.use_local_blast = use_local_blast self.local_blast_db = local_blast_db + self.blast_num_threads = blast_num_threads # Number of threads for BLAST search + if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.phr"): - logger.error("Local BLAST database files not found. Please check the path.") + self.logger.error("Local BLAST database files not found. Please check the path.") self.use_local_blast = False def get_by_accession(self, accession: str) -> Optional[dict]: @@ -56,7 +64,7 @@ def get_by_accession(self, accession: str) -> Optional[dict]: except RequestException: # network-related errors raise except Exception as exc: # pylint: disable=broad-except - logger.error("Accession %s not found: %s", accession, exc) + self.logger.error("Accession %s not found: %s", accession, exc) return None @staticmethod @@ -101,7 +109,7 @@ def get_best_hit(self, keyword: str) -> Optional[Dict]: except RequestException: raise except Exception as e: # pylint: disable=broad-except - logger.error("Keyword %s not found: %s", keyword, e) + self.logger.error("Keyword %s not found: %s", keyword, e) return None def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]: @@ -117,70 +125,69 @@ def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]: else: seq = fasta_sequence.strip() except Exception as e: # pylint: disable=broad-except - logger.error("Invalid FASTA sequence: %s", e) + self.logger.error("Invalid FASTA sequence: %s", e) return None if not seq: - logger.error("Empty FASTA sequence provided.") + self.logger.error("Empty FASTA sequence provided.") return None - accession = None if self.use_local_blast: accession = self._local_blast(seq, threshold) if accession: - logger.debug("Local BLAST found accession: %s", accession) + self.logger.debug("Local BLAST found accession: %s", accession) + return self.get_by_accession(accession) + self.logger.info( + "Local BLAST found no match for sequence. " + "API fallback disabled when using local database." 
+ ) + return None - if not accession: - logger.debug("Falling back to NCBIWWW.qblast.") + # Fall back to network BLAST only if local BLAST is not enabled + self.logger.debug("Falling back to NCBIWWW.qblast.") - # UniProtKB/Swiss-Prot BLAST API - try: - logger.debug( - "Performing BLAST searcher for the given sequence: %s", seq - ) - result_handle = NCBIWWW.qblast( - program="blastp", - database="swissprot", - sequence=seq, - hitlist_size=1, - expect=threshold, - ) - blast_record = NCBIXML.read(result_handle) - except RequestException: - raise - except Exception as e: # pylint: disable=broad-except - logger.error("BLAST searcher failed: %s", e) - return None + # UniProtKB/Swiss-Prot BLAST API + try: + self.logger.debug( + "Performing BLAST searcher for the given sequence: %s", seq + ) + result_handle = NCBIWWW.qblast( + program="blastp", + database="swissprot", + sequence=seq, + hitlist_size=1, + expect=threshold, + ) + blast_record = NCBIXML.read(result_handle) + except RequestException: + raise + except Exception as e: # pylint: disable=broad-except + self.logger.error("BLAST searcher failed: %s", e) + return None - if not blast_record.alignments: - logger.info("No BLAST hits found for the given sequence.") - return None + if not blast_record.alignments: + self.logger.info("No BLAST hits found for the given sequence.") + return None - best_alignment = blast_record.alignments[0] - best_hsp = best_alignment.hsps[0] - if best_hsp.expect > threshold: - logger.info("No BLAST hits below the threshold E-value.") - return None - hit_id = best_alignment.hit_id + best_alignment = blast_record.alignments[0] + best_hsp = best_alignment.hsps[0] + if best_hsp.expect > threshold: + self.logger.info("No BLAST hits below the threshold E-value.") + return None - # like sp|P01308.1|INS_HUMAN - accession = hit_id.split("|")[1].split(".")[0] if "|" in hit_id else hit_id + # like sp|P01308.1|INS_HUMAN + hit_id = best_alignment.hit_id + accession = hit_id.split("|")[1].split(".")[0] if "|" in hit_id else hit_id return self.get_by_accession(accession) def _local_blast(self, seq: str, threshold: float) -> Optional[str]: """ Perform local BLAST search using local BLAST database. - :param seq: The protein sequence. - :param threshold: E-value threshold for BLAST searcher. - :return: The accession number of the best hit or None if not found. 
- """ - try: - with tempfile.NamedTemporaryFile( - mode="w+", suffix=".fa", delete=False - ) as tmp: - tmp.write(f">query\n{seq}\n") - tmp_name = tmp.name - + # Optimized BLAST command with: + # - num_threads: Use multiple threads for faster search + # - outfmt 6 sacc: Only return accession (minimal output) + # - max_target_seqs 1: Only need the best hit + # - evalue: Threshold for significance cmd = [ "blastp", "-db", @@ -191,17 +198,33 @@ def _local_blast(self, seq: str, threshold: float) -> Optional[str]: str(threshold), "-max_target_seqs", "1", + "-num_threads", + str(self.blast_num_threads), "-outfmt", - "6 sacc", # only return accession + "6 sacc", # Only accession, tab-separated ] - logger.debug("Running local blastp: %s", " ".join(cmd)) - out = subprocess.check_output(cmd, text=True).strip() + self.logger.debug("Running local blastp (threads=%d): %s", + self.blast_num_threads, " ".join(cmd)) + + # Run BLAST with timeout to avoid hanging + try: + out = subprocess.check_output( + cmd, + text=True, + timeout=300, # 5 minute timeout for BLAST search + stderr=subprocess.DEVNULL # Suppress BLAST warnings to reduce I/O + ).strip() + except subprocess.TimeoutExpired: + self.logger.warning("BLAST search timed out after 5 minutes for sequence") + os.remove(tmp_name) + return None + os.remove(tmp_name) if out: return out.split("\n", maxsplit=1)[0] return None except Exception as exc: # pylint: disable=broad-except - logger.error("Local blastp failed: %s", exc) + self.logger.error("Local blastp failed: %s", exc) return None @retry( @@ -222,11 +245,11 @@ async def search( # auto detect query type if not query or not isinstance(query, str): - logger.error("Empty or non-string input.") + self.logger.error("Empty or non-string input.") return None query = query.strip() - logger.debug("UniProt searcher query: %s", query) + self.logger.debug("UniProt searcher query: %s", query) loop = asyncio.get_running_loop() @@ -234,13 +257,23 @@ async def search( if query.startswith(">") or re.fullmatch( r"[ACDEFGHIKLMNPQRSTVWY\s]+", query, re.I ): - async with _blast_lock: + # Only use lock for network BLAST (NCBIWWW), local BLAST can run in parallel + if self.use_local_blast: + # Local BLAST can run in parallel, no lock needed result = await loop.run_in_executor( _get_pool(), self.get_by_fasta, query, threshold ) + else: + # Network BLAST needs lock to respect rate limits + async with _blast_lock: + result = await loop.run_in_executor( + _get_pool(), self.get_by_fasta, query, threshold + ) # check if accession number - elif re.fullmatch(r"[A-NR-Z0-9]{6,10}", query, re.I): + # UniProt accession IDs: 6-10 characters, must start with a letter + # Format: [A-Z][A-Z0-9]{5,9} (6-10 chars total: 1 letter + 5-9 alphanumeric) + elif re.fullmatch(r"[A-Z][A-Z0-9]{5,9}", query, re.I): result = await loop.run_in_executor( _get_pool(), self.get_by_accession, query ) diff --git a/graphgen/models/storage/kv/json_storage.py b/graphgen/models/storage/kv/json_storage.py index aa7c6f42..aa2bce19 100644 --- a/graphgen/models/storage/kv/json_storage.py +++ b/graphgen/models/storage/kv/json_storage.py @@ -1,46 +1,40 @@ import os from dataclasses import dataclass - -from graphgen.bases.base_storage import BaseKVStorage -from graphgen.utils import load_json, write_json - - -@dataclass -class JsonKVStorage(BaseKVStorage): - _data: dict[str, dict] = None - - def __post_init__(self): - self._file_name = os.path.join(self.working_dir, f"{self.namespace}.json") - self._data = load_json(self._file_name) or {} - print(f"Load KV 
{self.namespace} with {len(self._data)} data") - - @property - def data(self): - return self._data - - def all_keys(self) -> list[str]: - return list(self._data.keys()) - - def index_done_callback(self): - write_json(self._data, self._file_name) - - def get_by_id(self, id): - return self._data.get(id, None) - - def get_by_ids(self, ids, fields=None) -> list: - if fields is None: - return [self._data.get(id, None) for id in ids] - return [ - ( - {k: v for k, v in self._data[id].items() if k in fields} - if self._data.get(id, None) - else None - ) - for id in ids - ] - - def get_all(self) -> dict[str, dict]: - return self._data + def iter_items(self) -> Iterator[Tuple[str, dict]]: + """ + Iterate over all items without loading everything into memory at once. + Returns an iterator of (key, value) tuples. + """ + for key, value in self._data.items(): + yield key, value + + def get_batch(self, keys: list[str]) -> dict[str, dict]: + """ + Get a batch of items by their keys. + + :param keys: List of keys to retrieve. + :return: Dictionary of {key: value} for the requested keys. + """ + return {key: self._data.get(key) for key in keys if key in self._data} + + def iter_batches(self, batch_size: int = 10000) -> Iterator[dict[str, dict]]: + """ + Iterate over items in batches to avoid loading everything into memory. + + :param batch_size: Number of items per batch. + :return: Iterator of dictionaries, each containing up to batch_size items. + """ + batch = {} + count = 0 + for key, value in self._data.items(): + batch[key] = value + count += 1 + if count >= batch_size: + yield batch + batch = {} + count = 0 + if batch: + yield batch def filter_keys(self, data: list[str]) -> set[str]: return {s for s in data if s not in self._data} diff --git a/graphgen/operators/__init__.py b/graphgen/operators/__init__.py index 64c78af5..d8e78927 100644 --- a/graphgen/operators/__init__.py +++ b/graphgen/operators/__init__.py @@ -5,8 +5,8 @@ from .judge import JudgeService from .partition import PartitionService from .quiz import QuizService -from .read import read -from .search import search_all +from .read import read, read_files +from .search import SearchService operators = { "read": read, @@ -15,7 +15,7 @@ "quiz": QuizService, "judge": JudgeService, "extract": ExtractService, - "search": search_all, + "search": SearchService, "partition": PartitionService, "generate": GenerateService, } diff --git a/graphgen/operators/build_kg/build_kg_service.py b/graphgen/operators/build_kg/build_kg_service.py index ec3c7cc1..aabba3b9 100644 --- a/graphgen/operators/build_kg/build_kg_service.py +++ b/graphgen/operators/build_kg/build_kg_service.py @@ -8,15 +8,11 @@ from graphgen.utils import logger from .build_mm_kg import build_mm_kg -from .build_text_kg import build_text_kg - - -class BuildKGService(BaseOperator): - def __init__(self, working_dir: str = "cache", graph_backend: str = "kuzu"): + def __init__(self, working_dir: str = "cache"): super().__init__(working_dir=working_dir, op_name="build_kg_service") self.llm_client: BaseLLMWrapper = init_llm("synthesizer") self.graph_storage: BaseGraphStorage = init_storage( - backend=graph_backend, working_dir=working_dir, namespace="graph" + backend="kuzu", working_dir=working_dir, namespace="graph" ) def process(self, batch: pd.DataFrame) -> pd.DataFrame: @@ -37,24 +33,17 @@ def build_kg(self, chunks: List[Chunk]) -> None: for chunk in chunks if chunk.type in ("image", "video", "table", "formula") ] - - if len(text_chunks) == 0: - logger.info("All text chunks are already in 
the storage") + if len(omics_chunks) == 0: + logger.info("All omics chunks are already in the storage") else: - logger.info("[Text Entity and Relation Extraction] processing ...") - build_text_kg( - llm_client=self.llm_client, - kg_instance=self.graph_storage, - chunks=text_chunks, + logger.info( + "[Omics Entity and Relation Extraction] processing %d chunks (DNA/RNA/protein)...", + len(omics_chunks) ) - if len(mm_chunks) == 0: - logger.info("All multi-modal chunks are already in the storage") - else: - logger.info("[Multi-modal Entity and Relation Extraction] processing ...") - build_mm_kg( + build_omics_kg( llm_client=self.llm_client, kg_instance=self.graph_storage, - chunks=mm_chunks, + chunks=omics_chunks, ) self.graph_storage.index_done_callback() diff --git a/graphgen/operators/chunk/chunk_service.py b/graphgen/operators/chunk/chunk_service.py index 102c74fd..b6775764 100644 --- a/graphgen/operators/chunk/chunk_service.py +++ b/graphgen/operators/chunk/chunk_service.py @@ -42,14 +42,12 @@ def split_chunks(text: str, language: str = "en", **kwargs) -> list: class ChunkService(BaseOperator): - def __init__( - self, working_dir: str = "cache", kv_backend: str = "rocksdb", **chunk_kwargs - ): + def __init__(self, working_dir: str = "cache", **chunk_kwargs): super().__init__(working_dir=working_dir, op_name="chunk_service") tokenizer_model = os.getenv("TOKENIZER_MODEL", "cl100k_base") self.tokenizer_instance: Tokenizer = Tokenizer(model_name=tokenizer_model) self.chunk_storage = init_storage( - backend=kv_backend, + backend="rocksdb", working_dir=working_dir, namespace="chunk", ) diff --git a/graphgen/operators/generate/generate_service.py b/graphgen/operators/generate/generate_service.py index 1ae2f067..720b8488 100644 --- a/graphgen/operators/generate/generate_service.py +++ b/graphgen/operators/generate/generate_service.py @@ -7,36 +7,8 @@ AtomicGenerator, CoTGenerator, MultiHopGenerator, - VQAGenerator, -) -from graphgen.utils import logger, run_concurrent - - -class GenerateService(BaseOperator): - """ - Generate question-answer pairs based on nodes and edges. 
- """ - - def __init__( - self, - working_dir: str = "cache", - method: str = "aggregated", - data_format: str = "ChatML", - ): - super().__init__(working_dir=working_dir, op_name="generate_service") - self.llm_client: BaseLLMWrapper = init_llm("synthesizer") - - self.method = method - self.data_format = data_format - - if self.method == "atomic": - self.generator = AtomicGenerator(self.llm_client) - elif self.method == "aggregated": - self.generator = AggregatedGenerator(self.llm_client) - elif self.method == "multi_hop": - self.generator = MultiHopGenerator(self.llm_client) - elif self.method == "cot": - self.generator = CoTGenerator(self.llm_client) + elif self.method == "omics_qa": + self.generator = OmicsQAGenerator(self.llm_client) elif self.method in ["vqa"]: self.generator = VQAGenerator(self.llm_client) else: diff --git a/graphgen/operators/judge/judge_service.py b/graphgen/operators/judge/judge_service.py index c7693aec..35797084 100644 --- a/graphgen/operators/judge/judge_service.py +++ b/graphgen/operators/judge/judge_service.py @@ -11,11 +11,11 @@ class JudgeService(BaseOperator): """Service for judging graph edges and nodes using a trainee LLM.""" - def __init__(self, working_dir: str = "cache", graph_backend: str = "kuzu"): + def __init__(self, working_dir: str = "cache"): super().__init__(working_dir=working_dir, op_name="judge_service") self.llm_client: BaseLLMWrapper = init_llm("trainee") self.graph_storage: BaseGraphStorage = init_storage( - backend=graph_backend, + backend="kuzu", working_dir=working_dir, namespace="graph", ) diff --git a/graphgen/operators/partition/partition_service.py b/graphgen/operators/partition/partition_service.py index 2289fec6..a2d55213 100644 --- a/graphgen/operators/partition/partition_service.py +++ b/graphgen/operators/partition/partition_service.py @@ -18,21 +18,15 @@ class PartitionService(BaseOperator): - def __init__( - self, - working_dir: str = "cache", - graph_backend: str = "kuzu", - kv_backend: str = "rocksdb", - **partition_kwargs, - ): + def __init__(self, working_dir: str = "cache", **partition_kwargs): super().__init__(working_dir=working_dir, op_name="partition_service") self.kg_instance: BaseGraphStorage = init_storage( - backend=graph_backend, + backend="kuzu", working_dir=working_dir, namespace="graph", ) self.chunk_storage: BaseKVStorage = init_storage( - backend=kv_backend, + backend="rocksdb", working_dir=working_dir, namespace="chunk", ) @@ -70,8 +64,13 @@ def partition(self) -> Iterable[pd.DataFrame]: partitioner = LeidenPartitioner() elif method == "anchor_bfs": logger.info("Partitioning knowledge graph using Anchor BFS method.") + anchor_type = method_params.get("anchor_type") + if isinstance(anchor_type, list): + logger.info("Using multiple anchor types: %s", anchor_type) + else: + logger.info("Using single anchor type: %s", anchor_type) partitioner = AnchorBFSPartitioner( - anchor_type=method_params.get("anchor_type"), + anchor_type=anchor_type, anchor_ids=set(method_params.get("anchor_ids", [])) if method_params.get("anchor_ids") else None, @@ -137,15 +136,9 @@ def _attach_additional_data_to_node(self, batch: tuple) -> tuple: for node_id, node_data in nodes_data: entity_type = (node_data.get("entity_type") or "").lower() - if not entity_type: + if not source_ids: continue - source_ids = [ - sid.strip() - for sid in node_data.get("source_id", "").split("") - if sid.strip() - ] - # Handle images if "image" in entity_type: image_chunks = [ @@ -160,4 +153,5 @@ def _attach_additional_data_to_node(self, batch: tuple) 
-> tuple: node_data["image_data"] = json.loads(image_chunks[0]["content"]) logger.debug("Attached image data to node %s", node_id) + return nodes_data, edges_data diff --git a/graphgen/operators/quiz/quiz_service.py b/graphgen/operators/quiz/quiz_service.py index a6aeb7be..d3943cc8 100644 --- a/graphgen/operators/quiz/quiz_service.py +++ b/graphgen/operators/quiz/quiz_service.py @@ -12,8 +12,7 @@ class QuizService(BaseOperator): def __init__( self, working_dir: str = "cache", - graph_backend: str = "kuzu", - kv_backend: str = "rocksdb", +>>>>>>> feature/multi-omics-qa quiz_samples: int = 1, concurrency_limit: int = 200, ): @@ -21,11 +20,11 @@ def __init__( self.quiz_samples = quiz_samples self.llm_client: BaseLLMWrapper = init_llm("synthesizer") self.graph_storage: BaseGraphStorage = init_storage( - backend=graph_backend, working_dir=working_dir, namespace="graph" + backend="networkx", working_dir=working_dir, namespace="graph" ) # { _quiz_id: { "description": str, "quizzes": List[Tuple[str, str]] } } self.quiz_storage: BaseKVStorage = init_storage( - backend=kv_backend, working_dir=working_dir, namespace="quiz" + backend="rocksdb", working_dir=working_dir, namespace="quiz" ) self.generator = QuizGenerator(self.llm_client) self.concurrency_limit = concurrency_limit diff --git a/graphgen/operators/read/__init__.py b/graphgen/operators/read/__init__.py index cda44587..cd22453e 100644 --- a/graphgen/operators/read/__init__.py +++ b/graphgen/operators/read/__init__.py @@ -1 +1 @@ -from .read import read +from .read import read, read_files diff --git a/graphgen/operators/read/read.py b/graphgen/operators/read/read.py index fbed377e..e05f7e08 100644 --- a/graphgen/operators/read/read.py +++ b/graphgen/operators/read/read.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import Any, List, Optional, Union +from typing import Any, Dict, Iterator, List, Optional, Union import ray @@ -126,3 +126,4 @@ def read( except Exception as e: logger.error("[READ] Failed to read files from %s: %s", input_path, e) raise + diff --git a/graphgen/operators/search/__init__.py b/graphgen/operators/search/__init__.py index 3d90f12a..47144c77 100644 --- a/graphgen/operators/search/__init__.py +++ b/graphgen/operators/search/__init__.py @@ -1 +1 @@ -from .search_all import search_all +from .search_service import SearchService diff --git a/graphgen/run.py b/graphgen/run.py index b0383867..e0efe7ea 100644 --- a/graphgen/run.py +++ b/graphgen/run.py @@ -1,5 +1,7 @@ import argparse +import logging import os +import sys import time from importlib import resources from typing import Any, Dict @@ -18,68 +20,7 @@ load_dotenv() - -def set_working_dir(folder): - os.makedirs(folder, exist_ok=True) - - -def save_config(config_path, global_config): - if not os.path.exists(os.path.dirname(config_path)): - os.makedirs(os.path.dirname(config_path)) - with open(config_path, "w", encoding="utf-8") as config_file: - yaml.dump( - global_config, config_file, default_flow_style=False, allow_unicode=True - ) - - -class NodeFilenameProvider(FilenameProvider): - def __init__(self, node_id: str): - self.node_id = node_id - - def get_filename_for_block( - self, block: Block, write_uuid: str, task_index: int, block_index: int - ) -> str: - # format: {node_id}_{write_uuid}_{task_index:06}_{block_index:06}.json - return f"{self.node_id}_{write_uuid}_{task_index:06d}_{block_index:06d}.jsonl" - - def get_filename_for_row( - self, - row: Dict[str, Any], - write_uuid: str, - task_index: int, - block_index: int, - row_index: int, - ) -> str: - 
raise NotImplementedError( - f"Row-based filenames are not supported by write_json. " - f"Node: {self.node_id}, write_uuid: {write_uuid}" - ) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--config_file", - help="Config parameters for GraphGen.", - default=resources.files("graphgen") - .joinpath("configs") - .joinpath("aggregated_config.yaml"), - type=str, - ) - - args = parser.parse_args() - - with open(args.config_file, "r", encoding="utf-8") as f: - config = yaml.load(f, Loader=yaml.FullLoader) - - working_dir = config.get("global_params", {}).get("working_dir", "cache") - unique_id = int(time.time()) - output_path = os.path.join(working_dir, "output", f"{unique_id}") - set_working_dir(output_path) - log_path = os.path.join(working_dir, "logs", "Driver.log") - driver_logger = set_logger( - log_path, - name="GraphGen", + console_level=logging.ERROR, if_stream=True, ) CURRENT_LOGGER_VAR.set(driver_logger) @@ -89,26 +30,53 @@ def main(): log_path, ) - engine = Engine(config, operators) - ds = ray.data.from_items([]) - results = engine.execute(ds) - - for node_id, dataset in results.items(): - output_path = os.path.join(output_path, f"{node_id}") - os.makedirs(output_path, exist_ok=True) - dataset.write_json( - output_path, - filename_provider=NodeFilenameProvider(node_id), - pandas_json_args_fn=lambda: { - "force_ascii": False, - "orient": "records", - "lines": True, - }, - ) - logger.info("Node %s results saved to %s", node_id, output_path) - - save_config(os.path.join(output_path, "config.yaml"), config) - logger.info("GraphGen completed successfully. Data saved to %s", output_path) + # Temporarily suppress non-error output (print statements, third-party libraries, Ray Data progress) + # Only redirect stdout to preserve stderr for logger error output + global _devnull + _devnull = open(os.devnull, 'w', encoding='utf-8') + sys.stdout = _devnull + + try: + engine = Engine(config, operators) + ds = ray.data.from_items([]) + results = engine.execute(ds) + + for node_id, dataset in results.items(): + node_output_path = os.path.join(output_path, f"{node_id}") + os.makedirs(node_output_path, exist_ok=True) + dataset.write_json( + node_output_path, + filename_provider=NodeFilenameProvider(node_id), + pandas_json_args_fn=lambda: { + "force_ascii": False, + "orient": "records", + "lines": True, + }, + ) + logger.info("Node %s results saved to %s", node_id, node_output_path) + + save_config(os.path.join(output_path, "config.yaml"), config) + logger.info("GraphGen completed successfully. 
Data saved to %s", output_path) + finally: + # Restore original stdout before printing results + sys.stdout = _original_stdout + if _devnull: + _devnull.close() + _devnull = None + + # Print save information to console + if 'results' in locals() and results: + print("\n" + "="*60) + print("GraphGen execution completed successfully!") + print("="*60) + for node_id, dataset in results.items(): + node_output_path = os.path.join(output_path, f"{node_id}") + print(f"✓ Node '{node_id}' results saved to: {node_output_path}") + print(f"✓ Config saved to: {os.path.join(output_path, 'config.yaml')}") + print(f"✓ Logs saved to: {log_path}") + print("="*60 + "\n") + else: + print("\n⚠️ Warning: No results were generated.\n") if __name__ == "__main__": diff --git a/graphgen/templates/__init__.py b/graphgen/templates/__init__.py index 0940e910..99e297ee 100644 --- a/graphgen/templates/__init__.py +++ b/graphgen/templates/__init__.py @@ -6,9 +6,15 @@ ATOMIC_GENERATION_PROMPT, COT_GENERATION_PROMPT, MULTI_HOP_GENERATION_PROMPT, + OMICS_QA_GENERATION_PROMPT, VQA_GENERATION_PROMPT, ) -from .kg import KG_EXTRACTION_PROMPT, KG_SUMMARIZATION_PROMPT, MMKG_EXTRACTION_PROMPT +from .kg import ( + KG_EXTRACTION_PROMPT, + KG_SUMMARIZATION_PROMPT, + MMKG_EXTRACTION_PROMPT, + OMICS_KG_EXTRACTION_PROMPT, +) from .question_generation import QUESTION_GENERATION_PROMPT from .search_judgement import SEARCH_JUDGEMENT_PROMPT from .statement_judgement import STATEMENT_JUDGEMENT_PROMPT diff --git a/graphgen/templates/extraction/schema_guided_extraction.py b/graphgen/templates/extraction/schema_guided_extraction.py index 710900e5..e96c3886 100644 --- a/graphgen/templates/extraction/schema_guided_extraction.py +++ b/graphgen/templates/extraction/schema_guided_extraction.py @@ -7,7 +7,7 @@ 3. Present your findings in JSON format as specified below. Important Notes: -- Extract only relevant information. +- Extract only relevant information. - Consider the context of the entire document when determining relevance. - Do not be verbose, only respond with the correct format and information. - Some docs may have multiple relevant excerpts -- include all that apply. diff --git a/graphgen/templates/generation/__init__.py b/graphgen/templates/generation/__init__.py index b58c2b6c..7e967b7b 100644 --- a/graphgen/templates/generation/__init__.py +++ b/graphgen/templates/generation/__init__.py @@ -2,4 +2,5 @@ from .atomic_generation import ATOMIC_GENERATION_PROMPT from .cot_generation import COT_GENERATION_PROMPT from .multi_hop_generation import MULTI_HOP_GENERATION_PROMPT + from .vqa_generation import VQA_GENERATION_PROMPT diff --git a/graphgen/templates/generation/atomic_generation.py b/graphgen/templates/generation/atomic_generation.py index 499100f7..141c40e4 100644 --- a/graphgen/templates/generation/atomic_generation.py +++ b/graphgen/templates/generation/atomic_generation.py @@ -1,6 +1,6 @@ # pylint: disable=C0301 TEMPLATE_EN: str = """You are given a text passage. Your task is to generate a question and answer (QA) pair based on the content of that text. -The answer should be accurate and directly derived from the text. Make sure the QA pair is relevant to the main theme or important details of the given text. +The answer should be accurate and directly derived from the text. Make sure the QA pair is relevant to the main theme or important details of the given text. For example: Question: What is the effect of overexpressing the BG1 gene on grain size and development? 
Answer: Overexpression of the BG1 gene leads to significantly increased grain size, demonstrating its role in grain development. diff --git a/graphgen/templates/generation/cot_generation.py b/graphgen/templates/generation/cot_generation.py index 849a7c71..e179a704 100644 --- a/graphgen/templates/generation/cot_generation.py +++ b/graphgen/templates/generation/cot_generation.py @@ -106,7 +106,7 @@ 5. 推理路径生成 - 根据问题设计一个**可被后续模型直接执行的推理蓝图**。 -- 保持步骤最小化:每一步只解决一个“不可分割”的子问题。 +- 保持步骤最小化:每一步只解决一个“不可分割”的子问题。 -约束条件- 1. 不要在回答中描述你的思考过程,直接给出回复,只给出问题和推理路径设计,不要生成无关信息。 @@ -155,7 +155,7 @@ - The question must be verifiable directly within the graph through entities, relationships, or attributes; avoid subjective judgments. - The question should allow the model to think sufficiently, fully utilizing the entities and relationships in the graph, avoiding overly simple or irrelevant questions. -5. Reasoning-Path Design +5. Reasoning-Path Design - Output a **blueprint that any later model can directly execute**. - Keep steps minimal: each step solves one indivisible sub-problem. diff --git a/graphgen/templates/kg/__init__.py b/graphgen/templates/kg/__init__.py index ea865ce6..3f8b6ed2 100644 --- a/graphgen/templates/kg/__init__.py +++ b/graphgen/templates/kg/__init__.py @@ -1,3 +1,4 @@ from .kg_extraction import KG_EXTRACTION_PROMPT from .kg_summarization import KG_SUMMARIZATION_PROMPT from .mm_kg_extraction import MMKG_EXTRACTION_PROMPT + diff --git a/graphgen/templates/kg/mm_kg_extraction.py b/graphgen/templates/kg/mm_kg_extraction.py index 2805b98d..28327175 100644 --- a/graphgen/templates/kg/mm_kg_extraction.py +++ b/graphgen/templates/kg/mm_kg_extraction.py @@ -26,7 +26,7 @@ - target_entity: The name of the target entity identified in Step 1 - relationship_summary: Explain why you think the source entity and target entity are related to each other Format each relationship as ("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) - + 3. Return the output list of all entities and relationships identified in Steps 1 and 2 in English. Use **{record_delimiter}** as the list separator. 4. Upon completion, output {completion_delimiter} diff --git a/graphgen/templates/search_judgement.py b/graphgen/templates/search_judgement.py index 19b21840..06837ac8 100644 --- a/graphgen/templates/search_judgement.py +++ b/graphgen/templates/search_judgement.py @@ -1,7 +1,7 @@ # pylint: disable=C0301 TEMPLATE: str = """-Goal- -Please select the most relevant searcher result for the given entity. +Please select the most relevant searcher result for the given entity. The name and description of the entity are provided. The searcher results are provided as a list. Please select the most relevant searcher result from the list. If none of the searcher results are relevant, please select 'None of the above'. 
diff --git a/graphgen/utils/run_concurrent.py b/graphgen/utils/run_concurrent.py index d1a9b0e2..8315b953 100644 --- a/graphgen/utils/run_concurrent.py +++ b/graphgen/utils/run_concurrent.py @@ -1,6 +1,7 @@ import asyncio -from typing import Awaitable, Callable, List, TypeVar +from typing import Awaitable, Callable, List, Optional, TypeVar, Union +import gradio as gr from tqdm.asyncio import tqdm as tqdm_async from graphgen.utils.log import logger @@ -17,28 +18,105 @@ def run_concurrent( *, desc: str = "processing", unit: str = "item", -) -> List[R]: + progress_bar: Optional[gr.Progress] = None, + save_interval: int = 0, + save_callback: Optional[Callable[[List[R], int], None]] = None, + max_concurrent: Optional[int] = None, +) -> Union[List[R], Awaitable[List[R]]]: + """ + Run coroutines concurrently with optional periodic saving. + This function can be used in both sync and async contexts: + - In sync context: returns List[R] directly + - In async context: returns Awaitable[List[R]] (use with 'await') + :return: List of results (in sync context) or coroutine (in async context) + """ async def _run_all(): - tasks = [asyncio.create_task(coro_fn(item)) for item in items] + if not items: + return [] + # Use semaphore to limit concurrent tasks if max_concurrent is specified + semaphore = asyncio.Semaphore(max_concurrent) if max_concurrent is not None and max_concurrent > 0 else None + async def run_with_semaphore(item: T) -> R: + """Wrapper to apply semaphore if needed.""" + if semaphore: + async with semaphore: + return await coro_fn(item) + else: + return await coro_fn(item) + + # Create tasks with concurrency limit + if max_concurrent is not None and max_concurrent > 0: + # Use semaphore-controlled wrapper + tasks = [asyncio.create_task(run_with_semaphore(it)) for it in items] + else: + # Original behavior: create all tasks at once + tasks = [asyncio.create_task(coro_fn(it)) for it in items] + + completed_count = 0 results = [] + pending_save_results = [] pbar = tqdm_async(total=len(items), desc=desc, unit=unit) for future in asyncio.as_completed(tasks): try: result = await future results.append(result) - except Exception as e: + if save_interval > 0 and save_callback is not None: + pending_save_results.append(result) + except Exception as e: # pylint: disable=broad-except logger.exception("Task failed: %s", e) + # even if failed, record it to keep results consistent with tasks results.append(e) + completed_count += 1 pbar.update(1) + if progress_bar is not None: + progress = completed_count / len(items) + progress_bar(progress, desc=f"{desc} ({completed_count}/{len(items)})") + + # Periodic save + if save_interval > 0 and save_callback is not None and completed_count % save_interval == 0: + try: + # Filter out exceptions before saving + valid_results = [res for res in pending_save_results if not isinstance(res, Exception)] + save_callback(valid_results, completed_count) + pending_save_results = [] # Clear after saving + logger.info("Saved intermediate results: %d/%d completed", completed_count, len(items)) + except Exception as e: + logger.warning("Failed to save intermediate results: %s", e) + pbar.close() + + if progress_bar is not None: + progress_bar(1.0, desc=f"{desc} (completed)") + + # Save remaining results if any + if save_interval > 0 and save_callback is not None and pending_save_results: + try: + valid_results = [res for res in pending_save_results if not isinstance(res, Exception)] + save_callback(valid_results, completed_count) + logger.info("Saved final intermediate 
results: %d completed", completed_count) + except Exception as e: + logger.warning("Failed to save final intermediate results: %s", e) + + # filter out exceptions return [res for res in results if not isinstance(res, Exception)] - loop = create_event_loop() + # Check if we're in an async context (event loop is running) try: - return loop.run_until_complete(_run_all()) - finally: - loop.close() + _ = asyncio.get_running_loop() + # If we're in an async context, return the coroutine directly + # The caller should use 'await run_concurrent(...)' + return _run_all() + except RuntimeError: + # No running loop, we can create one and run until complete + if not items: + return [] + loop, created = create_event_loop() + try: + return loop.run_until_complete(_run_all()) + finally: + # Only close the loop if we created it + if created: + loop.close() diff --git a/graphgen/utils/wrap.py b/graphgen/utils/wrap.py index 57776f22..9689cea6 100644 --- a/graphgen/utils/wrap.py +++ b/graphgen/utils/wrap.py @@ -7,7 +7,15 @@ def async_to_sync_method(func: Callable) -> Callable: @wraps(func) def wrapper(self, *args, **kwargs) -> Any: - loop = create_event_loop() - return loop.run_until_complete(func(self, *args, **kwargs)) + loop, created = create_event_loop() + try: + if loop.is_running(): + raise RuntimeError( + "Cannot use async_to_sync_method when event loop is already running." + ) + return loop.run_until_complete(func(self, *args, **kwargs)) + finally: + if created: + loop.close() return wrapper From ce2b2961a260b9e0fb5b1b8b80914acdd510a357 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Thu, 18 Dec 2025 14:46:46 +0800 Subject: [PATCH 02/20] fix: remove remaining conflict markers --- graphgen/models/llm/local/vllm_wrapper.py | 1 - graphgen/models/searcher/db/ncbi_searcher.py | 1 - graphgen/models/searcher/db/rnacentral_searcher.py | 1 - graphgen/models/searcher/db/uniprot_searcher.py | 1 - graphgen/operators/quiz/quiz_service.py | 1 - 5 files changed, 5 deletions(-) diff --git a/graphgen/models/llm/local/vllm_wrapper.py b/graphgen/models/llm/local/vllm_wrapper.py index 40eb0d73..5d18e99c 100644 --- a/graphgen/models/llm/local/vllm_wrapper.py +++ b/graphgen/models/llm/local/vllm_wrapper.py @@ -8,7 +8,6 @@ class VLLMWrapper(BaseLLMWrapper): """ Async inference backend based on vLLM. 
""" ->>>>>>> feature/multi-omics-qa def __init__( self, model: str, diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index b43f63a2..4ddc8138 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -19,7 +19,6 @@ ) from graphgen.bases import BaseSearcher ->>>>>>> feature/multi-omics-qa @lru_cache(maxsize=None) diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py index 3654b60d..b59d9f39 100644 --- a/graphgen/models/searcher/db/rnacentral_searcher.py +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -18,7 +18,6 @@ ) from graphgen.bases import BaseSearcher ->>>>>>> feature/multi-omics-qa @lru_cache(maxsize=None) diff --git a/graphgen/models/searcher/db/uniprot_searcher.py b/graphgen/models/searcher/db/uniprot_searcher.py index 31876c38..899b715d 100644 --- a/graphgen/models/searcher/db/uniprot_searcher.py +++ b/graphgen/models/searcher/db/uniprot_searcher.py @@ -19,7 +19,6 @@ ) from graphgen.bases import BaseSearcher ->>>>>>> feature/multi-omics-qa @lru_cache(maxsize=None) diff --git a/graphgen/operators/quiz/quiz_service.py b/graphgen/operators/quiz/quiz_service.py index d3943cc8..66dc1193 100644 --- a/graphgen/operators/quiz/quiz_service.py +++ b/graphgen/operators/quiz/quiz_service.py @@ -12,7 +12,6 @@ class QuizService(BaseOperator): def __init__( self, working_dir: str = "cache", ->>>>>>> feature/multi-omics-qa quiz_samples: int = 1, concurrency_limit: int = 200, ): From 6d0868a2fc264b226f14b1351d72149567eb041d Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Thu, 18 Dec 2025 18:46:06 +0800 Subject: [PATCH 03/20] fix: restore files accidentally modified --- examples/generate/generate_omics_qa/README.md | 216 +++++++++ .../generate_omics_qa/generate_omics_qa.sh | 3 + .../generate_omics_qa_searched.sh | 3 + .../generate_omics_qa/omics_qa_config.yaml | 93 ++++ .../omics_qa_config_searched.yaml | 73 +++ .../input_examples/searched_dna_demo.jsonl | 3 + .../searched_protein_demo.jsonl | 8 + .../input_examples/searched_rna_demo.jsonl | 6 + graphgen/bases/base_operator.py | 8 + graphgen/bases/base_reader.py | 3 +- graphgen/bases/base_searcher.py | 30 ++ graphgen/engine.py | 37 +- graphgen/models/generator/__init__.py | 2 +- .../models/generator/omics_qa_generator.py | 365 +++++++++++++++ graphgen/models/kg_builder/__init__.py | 2 +- .../models/kg_builder/omics_kg_builder.py | 291 ++++++++++++ .../partitioner/anchor_bfs_partitioner.py | 4 + graphgen/models/reader/json_reader.py | 38 ++ graphgen/models/searcher/db/ncbi_searcher.py | 142 ++++++ .../models/searcher/db/rnacentral_searcher.py | 10 + .../models/searcher/db/uniprot_searcher.py | 9 +- graphgen/models/storage/kv/json_storage.py | 43 ++ graphgen/operators/__init__.py | 2 +- .../operators/build_kg/build_kg_service.py | 29 ++ graphgen/operators/build_kg/build_omics_kg.py | 51 +++ .../operators/generate/generate_service.py | 31 ++ .../operators/partition/partition_service.py | 130 ++++++ graphgen/operators/read/__init__.py | 2 +- graphgen/operators/read/read.py | 3 +- .../operators/search/multi_omics_search.py | 29 ++ graphgen/operators/search/search_all.py | 83 ---- graphgen/operators/search/search_service.py | 428 ++++++++++++++++++ graphgen/run.py | 138 ++++-- graphgen/templates/generation/__init__.py | 2 +- .../generation/omics_qa_generation.py | 99 ++++ graphgen/templates/kg/__init__.py | 3 +- 
graphgen/templates/kg/omics_kg_extraction.py | 209 +++++++++ .../templates/kg/protein_kg_extraction.py | 144 ++++++ graphgen/utils/loop.py | 34 +- 39 files changed, 2642 insertions(+), 164 deletions(-) create mode 100644 examples/generate/generate_omics_qa/README.md create mode 100755 examples/generate/generate_omics_qa/generate_omics_qa.sh create mode 100755 examples/generate/generate_omics_qa/generate_omics_qa_searched.sh create mode 100644 examples/generate/generate_omics_qa/omics_qa_config.yaml create mode 100644 examples/generate/generate_omics_qa/omics_qa_config_searched.yaml create mode 100644 examples/input_examples/searched_dna_demo.jsonl create mode 100644 examples/input_examples/searched_protein_demo.jsonl create mode 100644 examples/input_examples/searched_rna_demo.jsonl create mode 100644 graphgen/models/generator/omics_qa_generator.py create mode 100644 graphgen/models/kg_builder/omics_kg_builder.py create mode 100644 graphgen/operators/build_kg/build_omics_kg.py create mode 100644 graphgen/operators/search/multi_omics_search.py delete mode 100644 graphgen/operators/search/search_all.py create mode 100644 graphgen/operators/search/search_service.py create mode 100644 graphgen/templates/generation/omics_qa_generation.py create mode 100644 graphgen/templates/kg/omics_kg_extraction.py create mode 100644 graphgen/templates/kg/protein_kg_extraction.py diff --git a/examples/generate/generate_omics_qa/README.md b/examples/generate/generate_omics_qa/README.md new file mode 100644 index 00000000..9aca9561 --- /dev/null +++ b/examples/generate/generate_omics_qa/README.md @@ -0,0 +1,216 @@ +# Multi-omics Knowledge Graph QA Generation + +This example demonstrates how to build knowledge graphs from multi-omics data (DNA, RNA, protein) and generate question-answer pairs using the unified `omics_qa` method. + +## Pipeline Overview + +The pipeline includes the following steps: + +1. **read**: Read input files (JSON/JSONL format with sequence queries or protein data) +2. **search**: Search biological databases (NCBI for DNA, RNAcentral for RNA, UniProt for protein) - *optional if input already contains search results* +3. **chunk**: Chunk sequences and metadata +4. **build_kg**: Extract entities and relationships to build knowledge graph +5. **partition**: Partition the knowledge graph into communities using anchor-based BFS +6. **generate**: Generate QA pairs from partitioned communities with automatic molecule caption extraction + +## Key Features + +- **Unified QA Generation**: Single `omics_qa` method supports DNA, RNA, and Protein +- **Automatic Caption Extraction**: Automatically extracts and attaches molecule-specific information (dna/rna/protein captions) to each QA pair +- **Flexible Configuration**: Easy to switch between DNA, RNA, and Protein by changing input file and data source +- **Anchor-based Partitioning**: Uses molecule type as anchor for BFS partitioning (dna/rna/protein) + +## Quick Start + +### 1. Configure Input Data + +Edit `omics_qa_config.yaml` to set the input file path: + +**For DNA:** +```yaml +input_path: + - examples/input_examples/search_dna_demo.jsonl +``` + +**For RNA:** +```yaml +input_path: + - examples/input_examples/search_rna_demo.jsonl +``` + +**For Protein:** +```yaml +input_path: + - examples/input_examples/search_protein_demo.jsonl +``` + +### 2. 
Configure Data Source + +Set the appropriate data source and parameters in the `search_data` node: + +**For DNA (NCBI):** +```yaml +data_sources: [ncbi] +ncbi_params: + email: your_email@example.com # Required! + tool: GraphGen + use_local_blast: true + local_blast_db: refseq_release/refseq_release + blast_num_threads: 2 + max_concurrent: 5 +``` + +**For RNA (RNAcentral):** +```yaml +data_sources: [rnacentral] +rnacentral_params: + use_local_blast: true + local_blast_db: rnacentral_ensembl_gencode_YYYYMMDD/ensembl_gencode_YYYYMMDD + blast_num_threads: 2 + max_concurrent: 5 +``` + +**For Protein (UniProt):** +```yaml +data_sources: [uniprot] +uniprot_params: + use_local_blast: true + local_blast_db: ${RELEASE}/uniprot_sprot + blast_num_threads: 2 + max_concurrent: 5 +``` + +### 3. Configure Anchor Type + +Set the `anchor_type` in the `partition` node to match your molecule type: + +```yaml +partition: + params: + method: anchor_bfs + method_params: + anchor_type: protein # Change to "dna" or "rna" as needed + max_units_per_community: 10 +``` + +### 4. Run the Pipeline + +```bash +./generate_omics_qa.sh +``` + +Or run directly with Python: + +```bash +python3 -m graphgen.run \ + --config_file examples/generate/generate_omics_qa/omics_qa_config.yaml \ + --output_dir cache/ +``` + +## Input Format + +### For DNA/RNA (JSONL format): +```jsonl +{"type": "text", "content": "BRCA1"} +{"type": "text", "content": ">query\nATGCGATCG..."} +{"type": "text", "content": "ATGCGATCG..."} +``` + +### For Protein (JSONL format): +```jsonl +{"type": "text", "content": "P01308"} +{"type": "text", "content": "insulin"} +{"type": "text", "content": "MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"} +``` + +## Output Format + +The `omics_qa` method automatically extracts and attaches molecule-specific captions to QA pairs: + +### Alpaca Format: +```json +{ + "instruction": "What is the function of this protein?", + "input": "", + "output": "The protein functions as...", + "dna": {...}, # DNA caption (if molecule_type is DNA) + "rna": {...}, # RNA caption (if molecule_type is RNA) + "protein": {...} # Protein caption (if molecule_type is protein) +} +``` + +### ChatML Format: +```json +{ + "messages": [ + { + "role": "user", + "content": [ + { + "text": "What is the function of this protein?", + "dna": {...}, + "rna": {...}, + "protein": {...} + } + ] + }, + { + "role": "assistant", + "content": "The protein functions as..." + } + ] +} +``` + +## Caption Information + +The generator automatically extracts relevant caption information based on molecule type: + +- **DNA**: gene_name, gene_description, organism, chromosome, genomic_location, function, gene_type, etc. +- **RNA**: rna_type, description, organism, related_genes, gene_name, so_term, modifications, etc. +- **Protein**: protein_name, gene_names, organism, function, sequence, entry_name, etc. 
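+
+The snippet below is a minimal, hypothetical sketch (not part of the pipeline) of how the attached captions could be consumed downstream. It assumes the generated ChatML records are written as JSONL and follow the example structure shown above; the output path used in the usage comment is only a placeholder and depends on your `output_dir`.
+
+```python
+import json
+
+
+def iter_captions(path: str):
+    """Yield (molecule_type, caption_dict) pairs from a ChatML JSONL output file."""
+    with open(path, encoding="utf-8") as fh:
+        for line in fh:
+            record = json.loads(line)
+            # Per the ChatML example above, the first message is the user turn and
+            # its content list carries the question text plus the molecule caption.
+            user_turn = record["messages"][0]["content"][0]
+            # Exactly one of these keys is expected per record, matching molecule_type.
+            for key in ("dna", "rna", "protein"):
+                if key in user_turn:
+                    yield key, user_turn[key]
+
+
+# Example usage (hypothetical output path):
+# for molecule_type, caption in iter_captions("cache/generate/qa.jsonl"):
+#     print(molecule_type, caption.get("organism"))
+```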
+ +## Configuration Options + +### Chunking Parameters +- `chunk_size`: Size for text metadata chunks (default: 1024) +- `chunk_overlap`: Overlap for text chunks (default: 100) +- `sequence_chunk_size`: Size for sequence chunks (default: 1000) +- `sequence_chunk_overlap`: Overlap for sequence chunks (default: 100) + +### Partition Parameters +- `method`: `anchor_bfs` (recommended for omics data) +- `anchor_type`: `dna`, `rna`, or `protein` (must match your data type) +- `max_units_per_community`: Maximum nodes and edges per community (default: 10) + +### Generation Parameters +- `method`: `omics_qa` (unified method for DNA/RNA/Protein) +- `data_format`: `Alpaca`, `ChatML`, or `Sharegpt` + +## Notes + +- **NCBI requires an email address** - Make sure to set `email` in `ncbi_params` +- **Anchor type must match molecule type** - Set `anchor_type` to match your data (dna/rna/protein) +- **Local BLAST** can be enabled if you have local databases set up (see `examples/search/build_db/`) +- **Caption extraction** is automatic - The generator detects molecule type and extracts relevant caption information +- Adjust `max_concurrent` based on your system resources and API rate limits + +## Examples + +### Generate QA for Protein Data +1. Set `input_path` to `examples/input_examples/search_protein_demo.jsonl` +2. Set `data_sources: [uniprot]` +3. Set `anchor_type: protein` +4. Run `./generate_omics_qa.sh` + +### Generate QA for DNA Data +1. Set `input_path` to `examples/input_examples/search_dna_demo.jsonl` +2. Set `data_sources: [ncbi]` +3. Set `anchor_type: dna` +4. Run `./generate_omics_qa.sh` + +### Generate QA for RNA Data +1. Set `input_path` to `examples/input_examples/search_rna_demo.jsonl` +2. Set `data_sources: [rnacentral]` +3. Set `anchor_type: rna` +4. 
Run `./generate_omics_qa.sh` diff --git a/examples/generate/generate_omics_qa/generate_omics_qa.sh b/examples/generate/generate_omics_qa/generate_omics_qa.sh new file mode 100755 index 00000000..3afb129e --- /dev/null +++ b/examples/generate/generate_omics_qa/generate_omics_qa.sh @@ -0,0 +1,3 @@ +python3 -m graphgen.run \ + --config_file examples/generate/generate_omics_qa/omics_qa_config.yaml \ + --output_dir cache/ diff --git a/examples/generate/generate_omics_qa/generate_omics_qa_searched.sh b/examples/generate/generate_omics_qa/generate_omics_qa_searched.sh new file mode 100755 index 00000000..20b0b533 --- /dev/null +++ b/examples/generate/generate_omics_qa/generate_omics_qa_searched.sh @@ -0,0 +1,3 @@ +python3 -m graphgen.run \ + --config_file examples/generate/generate_omics_qa/omics_qa_config_searched.yaml \ + --output_dir cache/ diff --git a/examples/generate/generate_omics_qa/omics_qa_config.yaml b/examples/generate/generate_omics_qa/omics_qa_config.yaml new file mode 100644 index 00000000..22b9b26c --- /dev/null +++ b/examples/generate/generate_omics_qa/omics_qa_config.yaml @@ -0,0 +1,93 @@ +global_params: + working_dir: cache + graph_backend: kuzu # graph database backend, support: kuzu, networkx + kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv + +nodes: + - id: read_files + op_name: read + type: source + dependencies: [] + params: + input_path: + # three input files to generate DNA, RNA, and Protein data together + - examples/input_examples/search_dna_demo.jsonl + - examples/input_examples/search_rna_demo.jsonl + - examples/input_examples/search_protein_demo.jsonl + + - id: search_data + op_name: search + type: map_batch + dependencies: + - read_files + execution_params: + replicas: 1 + batch_size: 10 + params: + data_sources: [ncbi, rnacentral, uniprot] # Multi-omics: use all three data sources + # DNA search parameters + ncbi_params: + email: your_email@example.com # Required for NCBI + tool: GraphGen + use_local_blast: true + local_blast_db: databases/refseq_232_old/refseq_232 + blast_num_threads: 2 + max_concurrent: 5 + # RNA search parameters + rnacentral_params: + use_local_blast: true + local_blast_db: databases/rnacentral_merged_20251213/rnacentral_merged_20251213 + blast_num_threads: 2 + max_concurrent: 5 + # Protein search parameters + uniprot_params: + use_local_blast: true + # local_blast_db: ${RELEASE}/uniprot_sprot + local_blast_db: databases/2025_04/uniprot_sprot + blast_num_threads: 2 + max_concurrent: 5 + + - id: chunk_documents + op_name: chunk + type: map_batch + dependencies: + - search_data + execution_params: + replicas: 4 + params: + chunk_size: 1024 # chunk size for text splitting + chunk_overlap: 100 # chunk overlap for text splitting + sequence_chunk_size: 1000 # For sequence chunks (bp for DNA/RNA, aa for protein) + sequence_chunk_overlap: 100 + + - id: build_kg + op_name: build_kg + type: map_batch + dependencies: + - chunk_documents + execution_params: + replicas: 1 + batch_size: 128 + + - id: partition + op_name: partition + type: aggregate + dependencies: + - build_kg + params: + method: anchor_bfs # partition method + method_params: + anchor_type: [dna, rna, protein] # Multi-omics: support multiple anchor types (list or single string) + max_units_per_community: 10 # max nodes and edges per community + + - id: generate + op_name: generate + type: map_batch + dependencies: + - partition + execution_params: + replicas: 1 + batch_size: 128 + params: + method: omics_qa # unified QA generation method for DNA/RNA/Protein + 
data_format: ChatML # Alpaca, Sharegpt, ChatML diff --git a/examples/generate/generate_omics_qa/omics_qa_config_searched.yaml b/examples/generate/generate_omics_qa/omics_qa_config_searched.yaml new file mode 100644 index 00000000..cf01bc65 --- /dev/null +++ b/examples/generate/generate_omics_qa/omics_qa_config_searched.yaml @@ -0,0 +1,73 @@ +global_params: + working_dir: cache + graph_backend: kuzu # graph database backend, support: kuzu, networkx + kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv + +nodes: + - id: read_files + op_name: read + type: source + dependencies: [] + params: + input_path: + # Use pre-searched data files (skip search step) + # The search_service will automatically detect and skip search if data already contains search results + - examples/input_examples/searched_dna_demo.jsonl + - examples/input_examples/searched_rna_demo.jsonl + - examples/input_examples/searched_protein_demo.jsonl + + - id: search_data + op_name: search + type: map_batch + dependencies: + - read_files + execution_params: + replicas: 1 + batch_size: 10 + # Note: search_service will automatically detect pre-searched data and skip search, + # but it will still normalize the data format (ensure _doc_id, content, data_source fields exist) + + - id: chunk_documents + op_name: chunk + type: map_batch + dependencies: + - search_data + execution_params: + replicas: 4 + params: + chunk_size: 1024 # chunk size for text splitting + chunk_overlap: 100 # chunk overlap for text splitting + sequence_chunk_size: 1000 # For sequence chunks (bp for DNA/RNA, aa for protein) + sequence_chunk_overlap: 100 + + - id: build_kg + op_name: build_kg + type: map_batch + dependencies: + - chunk_documents + execution_params: + replicas: 1 + batch_size: 128 + + - id: partition + op_name: partition + type: aggregate + dependencies: + - build_kg + params: + method: anchor_bfs # partition method + method_params: + anchor_type: [dna, rna, protein] # Multi-omics: support multiple anchor types (list or single string) + max_units_per_community: 10 # max nodes and edges per community + + - id: generate + op_name: generate + type: map_batch + dependencies: + - partition + execution_params: + replicas: 1 + batch_size: 128 + params: + method: omics_qa # unified QA generation method for DNA/RNA/Protein + data_format: ChatML # Alpaca, Sharegpt, ChatML diff --git a/examples/input_examples/searched_dna_demo.jsonl b/examples/input_examples/searched_dna_demo.jsonl new file mode 100644 index 00000000..05778743 --- /dev/null +++ b/examples/input_examples/searched_dna_demo.jsonl @@ -0,0 +1,3 @@ +{"_doc_id":"doc-NG_011079","type":"dna","content":"Title: Homo sapiens ribosomal protein L35a pseudogene 6 (RPL35AP6) on chromosome 1\nSequence: ACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG","data_source":"ncbi","molecule_type":"DNA","database":"NCBI","id":"NG_011079","gene_name":"RPL35AP6","gene_description":"ribosomal protein L35a pseudogene 6","organism":"Homo 
sapiens","url":"https:\/\/www.ncbi.nlm.nih.gov\/nuccore\/NG_011079","gene_synonyms":["RPL35A_3_191"],"gene_type":"other","chromosome":"1","genomic_location":"1-522","function":null,"title":"Homo sapiens ribosomal protein L35a pseudogene 6 (RPL35AP6) on chromosome 1","sequence":"ACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG","sequence_length":522,"gene_id":"100271312","molecule_type_detail":"genomic region","_search_query":"ACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG"} +{"_doc_id":"doc-NG_033923","type":"dna","content":"Title: Callithrix jacchus immunity-related GTPase family, M, pseudogene (IRGMP) on chromosome 2\nSequence: GAACTCCTGACCTCAGGTGATCCACCTGCTTTGGCCTCCCAAAATGCCAGGATTACAGGTATGAGCCACCACGCCCAGCCAGCATTGGGGTATATCGAAGGCAGAGGTCATGAATGTTGAGAGAGCCTCAGCAGATGGGGACTTGCCAGAGGTGGTCTCTGCCATCAAGGAGAGTTTGAAGATAGTGTTCAGGACACCAGTCAACATCGCTATGGCAGGGGACTCTGGCAATAGCATATCCACCTTCATCAGTGCACTTCAAATCGCAGGGCATGAGGCGAAGGCCTCACCTCCTACTGGGCTGGTAAAAGCTACCCAAAGATGTGCCTCCTATTTCTCTTCCCGCTTTCCAAATGTGGTGCTGTGGGATCTGCCTGGAGCAGGGTCTGCCACCAAAACTCTGGAGAACTACCTGATGGAAATGTAGTTCAACCAATATGACTTCATCATGGTTGCATCTGCACAATTCAGCATGAATCATGTGATCCTTGCCAAAACCATTGAGGACATGGGAAAGAAGTTCTACATTGTCTGGACCAAGCTGGACATGGATCTCAGCACAGGTGCCCTCCCAGAAGTGCAGCTACTGTAAATCAGAGAAAATGTCCTGGAAAGTCTCCAGAGGGAGCAGGTATGTGAACTCCCCATATTTATGGCCTCCAGCCTTGAACCTTTATTGCATGACTTCCCAAAGCTTAGAGACACATTGCAAAAGACTCATCCAAATTAGGTGCCATGGCCCTCTTCAAAACCTGTCCCACACCTGTGAGATGATCACGAATGACAAAGCAATCTCCCTGCAGAAGAAAACAACCATACAGTCTTTCCAG","data_source":"ncbi","molecule_type":"DNA","database":"NCBI","id":"NG_033923","gene_name":"IRGMP","gene_description":"immunity-related GTPase family, M, pseudogene","organism":"Callithrix jacchus","url":"https:\/\/www.ncbi.nlm.nih.gov\/nuccore\/NG_033923","gene_synonyms":null,"gene_type":"other","chromosome":"2","genomic_location":"1-830","function":null,"title":"Callithrix jacchus immunity-related GTPase family, M, pseudogene (IRGMP) on chromosome 
2","sequence":"GAACTCCTGACCTCAGGTGATCCACCTGCTTTGGCCTCCCAAAATGCCAGGATTACAGGTATGAGCCACCACGCCCAGCCAGCATTGGGGTATATCGAAGGCAGAGGTCATGAATGTTGAGAGAGCCTCAGCAGATGGGGACTTGCCAGAGGTGGTCTCTGCCATCAAGGAGAGTTTGAAGATAGTGTTCAGGACACCAGTCAACATCGCTATGGCAGGGGACTCTGGCAATAGCATATCCACCTTCATCAGTGCACTTCAAATCGCAGGGCATGAGGCGAAGGCCTCACCTCCTACTGGGCTGGTAAAAGCTACCCAAAGATGTGCCTCCTATTTCTCTTCCCGCTTTCCAAATGTGGTGCTGTGGGATCTGCCTGGAGCAGGGTCTGCCACCAAAACTCTGGAGAACTACCTGATGGAAATGTAGTTCAACCAATATGACTTCATCATGGTTGCATCTGCACAATTCAGCATGAATCATGTGATCCTTGCCAAAACCATTGAGGACATGGGAAAGAAGTTCTACATTGTCTGGACCAAGCTGGACATGGATCTCAGCACAGGTGCCCTCCCAGAAGTGCAGCTACTGTAAATCAGAGAAAATGTCCTGGAAAGTCTCCAGAGGGAGCAGGTATGTGAACTCCCCATATTTATGGCCTCCAGCCTTGAACCTTTATTGCATGACTTCCCAAAGCTTAGAGACACATTGCAAAAGACTCATCCAAATTAGGTGCCATGGCCCTCTTCAAAACCTGTCCCACACCTGTGAGATGATCACGAATGACAAAGCAATCTCCCTGCAGAAGAAAACAACCATACAGTCTTTCCAG","sequence_length":830,"gene_id":"100409682","molecule_type_detail":"genomic region","_search_query":"NG_033923"} +{"_doc_id":"doc-NG_056118","type":"dna","content":"Title: Homo sapiens major histocompatibility complex, class II, DR beta 8 (pseudogene) (HLA-DRB8) on chromosome 6\nSequence: GCCAGAGCCTAGGTTTACAGAGAAGCAGACAAACAAAACAGCCAAACAAGGAGACTTACTCTGTCTTCATGACTCATTCCCTCTACATTTTTTCTTCTAGTCCATCCTAAGGTGACTGTGTATCCTTTAAAGACCCAGCCCCTGCAGCACCACAACCTCCTGGTCTGCTCTGTGAGTGGTTTCTGTCCAGCCAGCATTGAAGTCAGGTGGTTCCGGAACGGCCAGGAAGAGAAGGCTGGGGTGGTGTCCACAGGCCTGATCCAGAATGGAGACTGGACCTTCCAGACACTGATGATGCTGGAAACAGTTCCTCAGAGTGGAGAGGTTTACACCTGCCAAGTGGAGCATCCAAGCATGATGAGCCCTCTCACGGTGCAATGGAGTTAGCAGCTTTCTGACTTCATAAATTTTTCACCCAGTAAGTACAGGACTGTGCTAATCCCTGAGTGTCAGGTTTCTCCTCTCCCACATCCTATTTTCATTTGCTCCATATTCTCATCTCCATCAGCACAGGTCACTGGGGATAGCCCTGTAATCATTTCTAAAAGCACCTGTACCCCATGGTAAAGCAGTCATGCCTGCCAGGCGGGAGAGGCTGTCTCTCTTTTGAACCTCCCCATGATGGCACAGGTCAGGGTCACCCACTCTCCCTGGCTCCAGGCCCTGCCTCTGGGTCTGAGATTGTATTTCTGCTGCTGTTGCTCTGGGTTGTTTGTTGTGATCTGAGAAGAGGAGAACTGTAGGGGTCTTCCTGGCATGAGGGGAGTCCAATCCCAGCTCTGCCTTTTATTAGCTCTGTCACTCTAGACAAACTACTAAACCTCTTTGAGTCTCAGGATTTCTGTGGATCAGATGTCAAAGTCATGCCTTACATCAAGGCTGTAATATTTGAATGAGTTTGAGGCCTAACCTTGTAACTGTTCAGTGTGATCTGAAAACCTTTTTTCCCCAGAAATAGCTAGTTATTTTAGTTCTTGCAGGGCAGCCTTCTTCCCCATTTTCAAAGCTCTGAATCTCAGTATCTCAATTACAGAGGTTCAATTTGGGATAAAAATCACTAAACCTGGCTTCCACTCTCAGGAGCATGGTCTGAATCTGCACAGAGCAAGATGCTGAGTGGAGTCGGGGGCTTTGTGCTGGGCCTGCTCTTCCTTGGGGCCGGGCTGTTTCTCTACTTCAGGAATCAGAAAGGTGAGGAACCTTTCGTAGCTGGCTCTCTCCATAGACTTTTCTGGAGGAGGAAATATGGCTTTGCAGAGGTTAGTTCTCAGTATATGAGTGGCCCTGGATAAAGCCTTTCTTTCCCAAAACGACCTCCAATGTCCCGCTAATCCAGAAATCATCAGTGCATGGTTACTATGTCAAAGCATAATAGCTTATGGCCTGCAGAGAGAAAAGAAAGGCTAACAAGTAGGGATCCTTTGGTTGGAGATCCTGGAGCAAATTAAGGAAGAGCCACTAAGGTTAATACAATTACACTGGATCCTATGACAGACACTTCACGCTTCAGGGGTCACGTGGTGAGTTTCTGCTCCTCTCTGCCCTGGTTCATGTAAGTTGTGGTGTTAGAGAAATCTCAGGTGGGAGATCTGGGGCTGGGATATTGTGTTGGAGGACAGATTTGCTTCCATATCTTTTTTCTTTTTTCTTTTTTTTGAGACGGAGTCTCGCTCTGTCCCCAGGCTGGAGTGCAGTGGCGTGATCTTGGCTCACTGCAACCTCCTTCTCCCGGATTCAAGTGATTCTCCTGCCTCAACCTCCCGAGTAGCTGGGACTATAGGCACCTGCCACCACGCCCAGCTAATTTTTGTATTTTTAGTAGAGATGGGGTTTCACCATGTTGGCCAAGATGGTCTCGATCTCTTGACCTTGTGATCCACCCAACTTGGCCTCCCAAAGTGCTGGGATTACAGGCATGAGCCACCGCACCCGGCCTGCTTCCATATCTTTTAAATGTGTATCTTTTCCCCTTTTTCCCAGGACACTCTGGACTTCAGCCAACAGGTAATACCTTTTCATTCTCTTTTAGAAACAGATTCGCTTTCCTAGAATGATGGTAGAGGTGATAAGGGATGAGACAGAAATAATAGGAAAGACTTTGGATCCAAATTTCTGATCAGGCAATTTACGCCAAAACTCCTCTCTACTTAGAAAAGGCCTGTGCTTGGCCAGGCGCAGTAGCTCATGCCTGTAATCTCAGCACTTTGGGAGGCTGAGGCGGGTGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGACCAACAAGGAGAAACCTTGTCTCTACTAAAAATACAAAAAAAATTAGCCATGCGTGGTGGCGCATGCCTGTAATTCCAGCTACTGAGGAGGCTGAGGTAGGAGAATGGTTTGAAGCTGGGAGGCAGAGGTTGTGGTAAGCGCACCACTGCACTCCAGCC
TGGGCAACAAGAGTGAAACTCCATCTGAAAAAATGAATAAATAAAAAATAAAAGGCCAGTGCTCTGCAGTAGTATTGGCTCAGGGAGACTTAGCAACTTGTTTTTCTTCTTCCTGTACTGCTTTCATCTGAGTCCCTGAAAGAGGGGGAAAGAAGCTGTTAGTAGAGCCATGTCTGAAAACAACACTCTCCTGTGTCTTCTGCAGGACTCCTGAACTGAAGTGAAGATGACCACATTCAAGGAGGAAACTTCTGCCCCAGCTTTGCAGGAGGAAAAGCTTTTCCGCTTGGCTCTTTTTTTTTTTTTTAGTTTTATTTAT","data_source":"ncbi","molecule_type":"DNA","database":"NCBI","id":"NG_056118","gene_name":"HLA-DRB8","gene_description":"major histocompatibility complex, class II, DR beta 8 (pseudogene)","organism":"Homo sapiens","url":"https:\/\/www.ncbi.nlm.nih.gov\/nuccore\/NG_056118","gene_synonyms":null,"gene_type":"other","chromosome":"6","genomic_location":"1-2737","function":null,"title":"Homo sapiens major histocompatibility complex, class II, DR beta 8 (pseudogene) (HLA-DRB8) on chromosome 6","sequence":"GCCAGAGCCTAGGTTTACAGAGAAGCAGACAAACAAAACAGCCAAACAAGGAGACTTACTCTGTCTTCATGACTCATTCCCTCTACATTTTTTCTTCTAGTCCATCCTAAGGTGACTGTGTATCCTTTAAAGACCCAGCCCCTGCAGCACCACAACCTCCTGGTCTGCTCTGTGAGTGGTTTCTGTCCAGCCAGCATTGAAGTCAGGTGGTTCCGGAACGGCCAGGAAGAGAAGGCTGGGGTGGTGTCCACAGGCCTGATCCAGAATGGAGACTGGACCTTCCAGACACTGATGATGCTGGAAACAGTTCCTCAGAGTGGAGAGGTTTACACCTGCCAAGTGGAGCATCCAAGCATGATGAGCCCTCTCACGGTGCAATGGAGTTAGCAGCTTTCTGACTTCATAAATTTTTCACCCAGTAAGTACAGGACTGTGCTAATCCCTGAGTGTCAGGTTTCTCCTCTCCCACATCCTATTTTCATTTGCTCCATATTCTCATCTCCATCAGCACAGGTCACTGGGGATAGCCCTGTAATCATTTCTAAAAGCACCTGTACCCCATGGTAAAGCAGTCATGCCTGCCAGGCGGGAGAGGCTGTCTCTCTTTTGAACCTCCCCATGATGGCACAGGTCAGGGTCACCCACTCTCCCTGGCTCCAGGCCCTGCCTCTGGGTCTGAGATTGTATTTCTGCTGCTGTTGCTCTGGGTTGTTTGTTGTGATCTGAGAAGAGGAGAACTGTAGGGGTCTTCCTGGCATGAGGGGAGTCCAATCCCAGCTCTGCCTTTTATTAGCTCTGTCACTCTAGACAAACTACTAAACCTCTTTGAGTCTCAGGATTTCTGTGGATCAGATGTCAAAGTCATGCCTTACATCAAGGCTGTAATATTTGAATGAGTTTGAGGCCTAACCTTGTAACTGTTCAGTGTGATCTGAAAACCTTTTTTCCCCAGAAATAGCTAGTTATTTTAGTTCTTGCAGGGCAGCCTTCTTCCCCATTTTCAAAGCTCTGAATCTCAGTATCTCAATTACAGAGGTTCAATTTGGGATAAAAATCACTAAACCTGGCTTCCACTCTCAGGAGCATGGTCTGAATCTGCACAGAGCAAGATGCTGAGTGGAGTCGGGGGCTTTGTGCTGGGCCTGCTCTTCCTTGGGGCCGGGCTGTTTCTCTACTTCAGGAATCAGAAAGGTGAGGAACCTTTCGTAGCTGGCTCTCTCCATAGACTTTTCTGGAGGAGGAAATATGGCTTTGCAGAGGTTAGTTCTCAGTATATGAGTGGCCCTGGATAAAGCCTTTCTTTCCCAAAACGACCTCCAATGTCCCGCTAATCCAGAAATCATCAGTGCATGGTTACTATGTCAAAGCATAATAGCTTATGGCCTGCAGAGAGAAAAGAAAGGCTAACAAGTAGGGATCCTTTGGTTGGAGATCCTGGAGCAAATTAAGGAAGAGCCACTAAGGTTAATACAATTACACTGGATCCTATGACAGACACTTCACGCTTCAGGGGTCACGTGGTGAGTTTCTGCTCCTCTCTGCCCTGGTTCATGTAAGTTGTGGTGTTAGAGAAATCTCAGGTGGGAGATCTGGGGCTGGGATATTGTGTTGGAGGACAGATTTGCTTCCATATCTTTTTTCTTTTTTCTTTTTTTTGAGACGGAGTCTCGCTCTGTCCCCAGGCTGGAGTGCAGTGGCGTGATCTTGGCTCACTGCAACCTCCTTCTCCCGGATTCAAGTGATTCTCCTGCCTCAACCTCCCGAGTAGCTGGGACTATAGGCACCTGCCACCACGCCCAGCTAATTTTTGTATTTTTAGTAGAGATGGGGTTTCACCATGTTGGCCAAGATGGTCTCGATCTCTTGACCTTGTGATCCACCCAACTTGGCCTCCCAAAGTGCTGGGATTACAGGCATGAGCCACCGCACCCGGCCTGCTTCCATATCTTTTAAATGTGTATCTTTTCCCCTTTTTCCCAGGACACTCTGGACTTCAGCCAACAGGTAATACCTTTTCATTCTCTTTTAGAAACAGATTCGCTTTCCTAGAATGATGGTAGAGGTGATAAGGGATGAGACAGAAATAATAGGAAAGACTTTGGATCCAAATTTCTGATCAGGCAATTTACGCCAAAACTCCTCTCTACTTAGAAAAGGCCTGTGCTTGGCCAGGCGCAGTAGCTCATGCCTGTAATCTCAGCACTTTGGGAGGCTGAGGCGGGTGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGACCAACAAGGAGAAACCTTGTCTCTACTAAAAATACAAAAAAAATTAGCCATGCGTGGTGGCGCATGCCTGTAATTCCAGCTACTGAGGAGGCTGAGGTAGGAGAATGGTTTGAAGCTGGGAGGCAGAGGTTGTGGTAAGCGCACCACTGCACTCCAGCCTGGGCAACAAGAGTGAAACTCCATCTGAAAAAATGAATAAATAAAAAATAAAAGGCCAGTGCTCTGCAGTAGTATTGGCTCAGGGAGACTTAGCAACTTGTTTTTCTTCTTCCTGTACTGCTTTCATCTGAGTCCCTGAAAGAGGGGGAAAGAAGCTGTTAGTAGAGCCATGTCTGAAAACAACACTCTCCTGTGTCTTCTGCAGGACTCCTGAACTGAAGTGAAGATGACCACATTCAAGGAGGAAACTTCTGCCCCAGCTTTGCAGGAGGAAAAGCTTTTCCGCTTGGCTCTTTTTTTTTTTTTTAGTT
TTATTTAT","sequence_length":2737,"gene_id":"3130","molecule_type_detail":"genomic region","_search_query":"NG_056118"} diff --git a/examples/input_examples/searched_protein_demo.jsonl b/examples/input_examples/searched_protein_demo.jsonl new file mode 100644 index 00000000..47ab02ad --- /dev/null +++ b/examples/input_examples/searched_protein_demo.jsonl @@ -0,0 +1,8 @@ +{"_doc_id":"doc-P01308","type":"protein","content":"Function: ['Insulin decreases blood glucose concentration. It increases cell permeability to monosaccharides, amino acids and fatty acids. It accelerates glycolysis, the pentose phosphate cycle, and glycogen synthesis in liver.']\nSequence: MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"P01308","entry_name":"INS_HUMAN","gene_names":[{"Name":"INS"}],"protein_name":"Insulin","organism":"Homo sapiens","sequence":"MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN","function":["Insulin decreases blood glucose concentration. It increases cell permeability to monosaccharides, amino acids and fatty acids. It accelerates glycolysis, the pentose phosphate cycle, and glycogen synthesis in liver."],"url":"https:\/\/www.uniprot.org\/uniprot\/P01308","_search_query":"P01308"} +{"_doc_id":"doc-Q6UWZ7","type":"protein","content":"Function: [\"Involved in DNA damage response and double-strand break (DSB) repair. Component of the BRCA1-A complex, acting as a central scaffold protein that assembles the various components of the complex and mediates the recruitment of BRCA1. The BRCA1-A complex specifically recognizes 'Lys-63'-linked ubiquitinated histones H2A and H2AX at DNA lesion sites, leading to target the BRCA1-BARD1 heterodimer to sites of DNA damage at DSBs. This complex also possesses deubiquitinase activity that specifically removes 'Lys-63'-linked ubiquitin on histones H2A and H2AX. 
{ECO:0000269|PubMed:17525340, ECO:0000269|PubMed:17643121, ECO:0000269|PubMed:17643122, ECO:0000269|PubMed:18077395, ECO:0000269|PubMed:19261748, ECO:0000269|PubMed:22357538, ECO:0000269|PubMed:26778126}.\"]\nSequence: MEGESTSAVLSGFVLGALAFQHLNTDSDTEGFLLGEVKGEAKNSITDSQMDDVEVVYTIDIQKYIPCYQLFSFYNSSGEVNEQALKKILSNVKKNVVGWYKFRRHSDQIMTFRERLLHKNLQEHFSNQDLVFLLLTPSIITESCSTHRLEHSLYKPQKGLFHRVPLVVANLGMSEQLGYKTVSGSCMSTGFSRAVQTHSSKFFEEDGSLKEVHKINEMYASLQEELKSICKKVEDSEQAVDKLVKDVNRLKREIEKRRGAQIQAAREKNIQKDPQENIFLCQALRTFFPNSEFLHSCVMSLKNRHVSKSSCNYNHHLDVVDNLTLMVEHTDIPEASPASTPQIIKHKALDLDDRWQFKRSRLLDTQDKRSKADTGSSNQDKASKMSSPETDEEIEKMKGFGEYSRSPTF","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"Q6UWZ7","entry_name":"ABRX1_HUMAN","gene_names":[{"Name":"ABRAXAS1 {ECO:0000312|HGNC:HGNC:25829}","Synonyms":["ABRA1 {ECO:0000312|HGNC:HGNC:25829}","CCDC98","FAM175A {ECO:0000312|HGNC:HGNC:25829}"],"ORFNames":["UNQ496\/PRO1013"]}],"protein_name":"BRCA1-A complex subunit Abraxas 1 {ECO:0000312|HGNC:HGNC:25829}","organism":"Homo sapiens","sequence":"MEGESTSAVLSGFVLGALAFQHLNTDSDTEGFLLGEVKGEAKNSITDSQMDDVEVVYTIDIQKYIPCYQLFSFYNSSGEVNEQALKKILSNVKKNVVGWYKFRRHSDQIMTFRERLLHKNLQEHFSNQDLVFLLLTPSIITESCSTHRLEHSLYKPQKGLFHRVPLVVANLGMSEQLGYKTVSGSCMSTGFSRAVQTHSSKFFEEDGSLKEVHKINEMYASLQEELKSICKKVEDSEQAVDKLVKDVNRLKREIEKRRGAQIQAAREKNIQKDPQENIFLCQALRTFFPNSEFLHSCVMSLKNRHVSKSSCNYNHHLDVVDNLTLMVEHTDIPEASPASTPQIIKHKALDLDDRWQFKRSRLLDTQDKRSKADTGSSNQDKASKMSSPETDEEIEKMKGFGEYSRSPTF","function":["Involved in DNA damage response and double-strand break (DSB) repair. Component of the BRCA1-A complex, acting as a central scaffold protein that assembles the various components of the complex and mediates the recruitment of BRCA1. The BRCA1-A complex specifically recognizes 'Lys-63'-linked ubiquitinated histones H2A and H2AX at DNA lesion sites, leading to target the BRCA1-BARD1 heterodimer to sites of DNA damage at DSBs. This complex also possesses deubiquitinase activity that specifically removes 'Lys-63'-linked ubiquitin on histones H2A and H2AX. {ECO:0000269|PubMed:17525340, ECO:0000269|PubMed:17643121, ECO:0000269|PubMed:17643122, ECO:0000269|PubMed:18077395, ECO:0000269|PubMed:19261748, ECO:0000269|PubMed:22357538, ECO:0000269|PubMed:26778126}."],"url":"https:\/\/www.uniprot.org\/uniprot\/Q6UWZ7","_search_query":"BRCA1"} +{"_doc_id":"doc-P27355","type":"protein","content":"Function: ['Responsible for the initial oxygenation of methane to methanol in methanotrophs. It also catalyzes the monohydroxylation of a variety of unactivated alkenes, alicyclic, aromatic and heterocyclic compounds.']\nSequence: MAKREPIHDNSIRTEWEAKIAKLTSVDQATKFIQDFRLAYTSPFRKSYDIDVDYQYIERKIEEKLSVLKTEKLPVADLITKATTGEDRAAVEATWIAKIKAAKSKYEADGIHIEFRQLYKPPVLPVNVFLRTDAALGTVLMEIRNTDYYGTPLEGLRKEPGVKVLHLQA","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"P27355","entry_name":"MEMG_METTR","gene_names":[{"Name":"mmoZ"}],"protein_name":"Methane monooxygenase component A gamma chain","organism":"Methylosinus trichosporium.","sequence":"MAKREPIHDNSIRTEWEAKIAKLTSVDQATKFIQDFRLAYTSPFRKSYDIDVDYQYIERKIEEKLSVLKTEKLPVADLITKATTGEDRAAVEATWIAKIKAAKSKYEADGIHIEFRQLYKPPVLPVNVFLRTDAALGTVLMEIRNTDYYGTPLEGLRKEPGVKVLHLQA","function":["Responsible for the initial oxygenation of methane to methanol in methanotrophs. 
It also catalyzes the monohydroxylation of a variety of unactivated alkenes, alicyclic, aromatic and heterocyclic compounds."],"url":"https:\/\/www.uniprot.org\/uniprot\/P27355","_search_query":"MAKREPIHDNSIRTEWEAKIAKLTSVDQATKFIQDFRLAYTSPFRKSYDIDVDYQYIERKIEEKLSVLKTEKLPVADLITKATTGEDAAAVEATWIAKIKAAKSKYEAEAIHIEFRQLYKPPVLPVNVFLRTDAALGTVLMEIRNTDYYGTPLEGLRKERGVKVLHLQA"} +{"_doc_id":"doc-Q96GG9","type":"protein","content":"Function: ['Part of an E3 ubiquitin ligase complex for neddylation (PubMed:18826954). Promotes neddylation of cullin components of E3 cullin-RING ubiquitin ligase complexes (PubMed:19617556, PubMed:23201271, PubMed:23401859, PubMed:26906416). Acts by binding to cullin-RBX1 complexes in the cytoplasm and promoting their nuclear translocation, enhancing recruitment of E2-NEDD8 (UBE2M-NEDD8) thioester to the complex, and optimizing the orientation of proteins in the complex to allow efficient transfer of NEDD8 from the E2 to the cullin substrates. Involved in the release of inhibitory effets of CAND1 on cullin-RING ligase E3 complex assembly and activity (PubMed:25349211, PubMed:28581483). Also acts as an oncogene facilitating malignant transformation and carcinogenic progression (By similarity). {ECO:0000250|UniProtKB:Q9QZ73, ECO:0000269|PubMed:18826954, ECO:0000269|PubMed:19617556, ECO:0000269|PubMed:23201271, ECO:0000269|PubMed:23401859, ECO:0000269|PubMed:25349211, ECO:0000269|PubMed:26906416, ECO:0000269|PubMed:28581483}.']\nSequence: MNKLKSSQKDKVRQFMIFTQSSEKTAVSCLSQNDWKLDVATDNFFQNPELYIRESVKGSLDRKKLEQLYNRYKDPQDENKIGIDGIQQFCDDLALDPASISVLIIAWKFRAATQCEFSKQEFMDGMTELGCDSIEKLKAQIPKMEQELKEPGRFKDFYQFTFNFAKNPGQKGLDLEMAIAYWNLVLNGRFKFLDLWNKFLLEHHKRSIPKDTWNLLLDFSTMIADDMSNYDEEGAWPVLIDDFVEFARPQIAGTKSTTV","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"Q96GG9","entry_name":"DCNL1_HUMAN","gene_names":[{"Name":"DCUN1D1 {ECO:0000312|HGNC:HGNC:18184}","Synonyms":["DCN1 {ECO:0000303|PubMed:28581483}","DCUN1L1","RP42","SCCRO"]}],"protein_name":"DCN1-like protein 1 {ECO:0000305}","organism":"Homo sapiens","sequence":"MNKLKSSQKDKVRQFMIFTQSSEKTAVSCLSQNDWKLDVATDNFFQNPELYIRESVKGSLDRKKLEQLYNRYKDPQDENKIGIDGIQQFCDDLALDPASISVLIIAWKFRAATQCEFSKQEFMDGMTELGCDSIEKLKAQIPKMEQELKEPGRFKDFYQFTFNFAKNPGQKGLDLEMAIAYWNLVLNGRFKFLDLWNKFLLEHHKRSIPKDTWNLLLDFSTMIADDMSNYDEEGAWPVLIDDFVEFARPQIAGTKSTTV","function":["Part of an E3 ubiquitin ligase complex for neddylation (PubMed:18826954). Promotes neddylation of cullin components of E3 cullin-RING ubiquitin ligase complexes (PubMed:19617556, PubMed:23201271, PubMed:23401859, PubMed:26906416). Acts by binding to cullin-RBX1 complexes in the cytoplasm and promoting their nuclear translocation, enhancing recruitment of E2-NEDD8 (UBE2M-NEDD8) thioester to the complex, and optimizing the orientation of proteins in the complex to allow efficient transfer of NEDD8 from the E2 to the cullin substrates. Involved in the release of inhibitory effets of CAND1 on cullin-RING ligase E3 complex assembly and activity (PubMed:25349211, PubMed:28581483). Also acts as an oncogene facilitating malignant transformation and carcinogenic progression (By similarity). 
{ECO:0000250|UniProtKB:Q9QZ73, ECO:0000269|PubMed:18826954, ECO:0000269|PubMed:19617556, ECO:0000269|PubMed:23201271, ECO:0000269|PubMed:23401859, ECO:0000269|PubMed:25349211, ECO:0000269|PubMed:26906416, ECO:0000269|PubMed:28581483}."],"url":"https:\/\/www.uniprot.org\/uniprot\/Q96GG9","_search_query":"MGSSHHHHHHSQDLENLYFQGSMNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRTKRVITTFRTGTWDAYKNLRKKLEQLYNRYKDPQDENKIGIDGIQQFCDDLALDPASISVLIIAWKFRAATQCEFSKQEFMDGMTELGCDSIEKLKAQIPKMEQELKEPGRFKDFYQFTFNFAKNPGQKGLDLEMAIAYWNLVLNGRFKFLDLWNKFLLEHHKRSIPKDTWNLLLDFSTMIADDMSNYDEEGAWPVLIDDFVEFARPQIAGTKSTTV"} +{"_doc_id":"doc-P68871","type":"protein","content":"Function: ['Involved in oxygen transport from the lung to the various peripheral tissues. {ECO:0000269|PubMed:28066926}.', 'LVV-hemorphin-7 potentiates the activity of bradykinin, causing a decrease in blood pressure.', '[Spinorphin]: Functions as an endogenous inhibitor of enkephalin-degrading enzymes such as DPP3, and as a selective antagonist of the P2RX3 receptor which is involved in pain signaling, these properties implicate it as a regulator of pain and inflammation.']\nSequence: MVHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"P68871","entry_name":"HBB_HUMAN","gene_names":[{"Name":"HBB"}],"protein_name":"Hemoglobin subunit beta","organism":"Homo sapiens","sequence":"MVHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH","function":["Involved in oxygen transport from the lung to the various peripheral tissues. 
{ECO:0000269|PubMed:28066926}.","LVV-hemorphin-7 potentiates the activity of bradykinin, causing a decrease in blood pressure.","[Spinorphin]: Functions as an endogenous inhibitor of enkephalin-degrading enzymes such as DPP3, and as a selective antagonist of the P2RX3 receptor which is involved in pain signaling, these properties implicate it as a regulator of pain and inflammation."],"url":"https:\/\/www.uniprot.org\/uniprot\/P68871","_search_query":"P68871"} +{"_doc_id":"doc-P22939","type":"protein","content":"Sequence: MDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDADMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQSLKQLAEQSLDTSALEALADYIIQRNK","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"P22939","entry_name":"ISPA_ECOLI","gene_names":[{"Name":"ispA","OrderedLocusNames":["b0421","JW0411"]}],"protein_name":"Farnesyl diphosphate synthase","organism":"Escherichia coli","sequence":"MDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDADMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQSLKQLAEQSLDTSALEALADYIIQRNK","function":[],"url":"https:\/\/www.uniprot.org\/uniprot\/P22939","_search_query":"MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"} +{"_doc_id":"doc-Q8I8V0","type":"protein","content":"Function: ['Component of several Gcn5-containing histone acetyltransferase complexes that regulate nucleosome organization; involved in acetylation of histone H3, particularly on Lys-10 (H3K9ac) and Lys-15 (H3K14ac) (PubMed:12482983, PubMed:12697829, PubMed:15340070, PubMed:19740772, PubMed:22796493). Regulates the transcription of a subset of genes during development; affects recruitment of RNA polymerase II (PubMed:19740772, PubMed:23336284). May be involved in the function of some acidic activation domains, which activate transcription at distant sites (PubMed:12697829). Involved in the p53-dependent apoptosis pathway response to DNA damage by genotoxic agents (PubMed:15340070, PubMed:16135810). {ECO:0000269|PubMed:12482983, ECO:0000269|PubMed:12697829, ECO:0000269|PubMed:15340070, ECO:0000269|PubMed:16135810, ECO:0000269|PubMed:19740772, ECO:0000269|PubMed:22796493, ECO:0000269|PubMed:23336284}.', '[Isoform B]: Component of the SAGA histone acetyltransferase complex, which predominantly acetylates histone H3. {ECO:0000269|PubMed:30559249}.', '[Isoform A]: Component of the CHAT histone acetyltransferase complex, which predominantly acetylates histone H3. 
{ECO:0000269|PubMed:30559249}.']\nSequence: MTTIADLFTKYNCTNCQDDIQGIRVHCAECENFDLCLQCFAAGAEIGAHQNNHSYQFMDTGTSILSVFRGKGAWTAREEIRLLDAIEQYGFGNWEDISKHIETKSAEDAKEEYVNKFVNGTIGRATWTPAQSQRPRLIDHTGDDDAGPLGTNALSTLPPLEINSDEAMQLGYMPNRDSFEREYDPTAEQLISNISLSSEDTEVDVMLKLAHVDIYTRRLRERARRKRMVRDYQLVSNFFRNRNYAQQQGLTKEQREFRDRFRVYAQFYTCNEYERLLGSLEREKELRIRQSELYRYRYNGLTKIAECTHFEQHAATATHRSTGPYGHGKTDHTHTSNGSHRPPSSSLHSPQPNLRKVEMSSGGEASSNSIAPRNTLHIADPTCSGALLPSKNYLDSCRGSSAATMLQTTGMVMGVTVDSGATTGVTSTATTMANLPTNSAKGSQQHLQPLQQHPQLLQSGNQHKMQNEAAGGGSDQVPSMSLKLRTQLEELKHLPQPPGSELLSHNELDLCKKHNITPTTYLSVKTVCLSGAPSLGSPMETSLRKFFIKCGWLSH","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"Q8I8V0","entry_name":"TAD2B_DROME","gene_names":[{"Name":"Ada2b {ECO:0000312|FlyBase:FBgn0037555}","Synonyms":["Ada2S {ECO:0000303|PubMed:12697829}"],"ORFNames":["CG9638 {ECO:0000312|FlyBase:FBgn0037555}"]}],"protein_name":"Transcriptional adapter 2b {ECO:0000312|FlyBase:FBgn0037555}","organism":"Drosophila melanogaster","sequence":"MTTIADLFTKYNCTNCQDDIQGIRVHCAECENFDLCLQCFAAGAEIGAHQNNHSYQFMDTGTSILSVFRGKGAWTAREEIRLLDAIEQYGFGNWEDISKHIETKSAEDAKEEYVNKFVNGTIGRATWTPAQSQRPRLIDHTGDDDAGPLGTNALSTLPPLEINSDEAMQLGYMPNRDSFEREYDPTAEQLISNISLSSEDTEVDVMLKLAHVDIYTRRLRERARRKRMVRDYQLVSNFFRNRNYAQQQGLTKEQREFRDRFRVYAQFYTCNEYERLLGSLEREKELRIRQSELYRYRYNGLTKIAECTHFEQHAATATHRSTGPYGHGKTDHTHTSNGSHRPPSSSLHSPQPNLRKVEMSSGGEASSNSIAPRNTLHIADPTCSGALLPSKNYLDSCRGSSAATMLQTTGMVMGVTVDSGATTGVTSTATTMANLPTNSAKGSQQHLQPLQQHPQLLQSGNQHKMQNEAAGGGSDQVPSMSLKLRTQLEELKHLPQPPGSELLSHNELDLCKKHNITPTTYLSVKTVCLSGAPSLGSPMETSLRKFFIKCGWLSH","function":["Component of several Gcn5-containing histone acetyltransferase complexes that regulate nucleosome organization; involved in acetylation of histone H3, particularly on Lys-10 (H3K9ac) and Lys-15 (H3K14ac) (PubMed:12482983, PubMed:12697829, PubMed:15340070, PubMed:19740772, PubMed:22796493). Regulates the transcription of a subset of genes during development; affects recruitment of RNA polymerase II (PubMed:19740772, PubMed:23336284). May be involved in the function of some acidic activation domains, which activate transcription at distant sites (PubMed:12697829). Involved in the p53-dependent apoptosis pathway response to DNA damage by genotoxic agents (PubMed:15340070, PubMed:16135810). {ECO:0000269|PubMed:12482983, ECO:0000269|PubMed:12697829, ECO:0000269|PubMed:15340070, ECO:0000269|PubMed:16135810, ECO:0000269|PubMed:19740772, ECO:0000269|PubMed:22796493, ECO:0000269|PubMed:23336284}.","[Isoform B]: Component of the SAGA histone acetyltransferase complex, which predominantly acetylates histone H3. {ECO:0000269|PubMed:30559249}.","[Isoform A]: Component of the CHAT histone acetyltransferase complex, which predominantly acetylates histone H3. {ECO:0000269|PubMed:30559249}."],"url":"https:\/\/www.uniprot.org\/uniprot\/Q8I8V0","_search_query":"p53"} +{"_doc_id":"doc-P04637","type":"protein","content":"Function: ['Multifunctional transcription factor that induces cell cycle arrest, DNA repair or apoptosis upon binding to its target DNA sequence (PubMed:11025664, PubMed:12524540, PubMed:12810724, PubMed:15186775, PubMed:15340061, PubMed:17317671, PubMed:17349958, PubMed:19556538, PubMed:20673990, PubMed:20959462, PubMed:22726440, PubMed:24051492, PubMed:24652652, PubMed:35618207, PubMed:36634798, PubMed:38653238, PubMed:9840937). 
Acts as a tumor suppressor in many tumor types; induces growth arrest or apoptosis depending on the physiological circumstances and cell type (PubMed:11025664, PubMed:12524540, PubMed:12810724, PubMed:15186775, PubMed:15340061, PubMed:17189187, PubMed:17317671, PubMed:17349958, PubMed:19556538, PubMed:20673990, PubMed:20959462, PubMed:22726440, PubMed:24051492, PubMed:24652652, PubMed:38653238, PubMed:9840937). Negatively regulates cell division by controlling expression of a set of genes required for this process (PubMed:11025664, PubMed:12524540, PubMed:12810724, PubMed:15186775, PubMed:15340061, PubMed:17317671, PubMed:17349958, PubMed:19556538, PubMed:20673990, PubMed:20959462, PubMed:22726440, PubMed:24051492, PubMed:24652652, PubMed:9840937). One of the activated genes is an inhibitor of cyclin-dependent kinases. Apoptosis induction seems to be mediated either by stimulation of BAX and FAS antigen expression, or by repression of Bcl-2 expression (PubMed:12524540, PubMed:17189187). Its pro-apoptotic activity is activated via its interaction with PPP1R13B\/ASPP1 or TP53BP2\/ASPP2 (PubMed:12524540). However, this activity is inhibited when the interaction with PPP1R13B\/ASPP1 or TP53BP2\/ASPP2 is displaced by PPP1R13L\/iASPP (PubMed:12524540). In cooperation with mitochondrial PPIF is involved in activating oxidative stress-induced necrosis; the function is largely independent of transcription. Induces the transcription of long intergenic non-coding RNA p21 (lincRNA-p21) and lincRNA-Mkln1. LincRNA-p21 participates in TP53-dependent transcriptional repression leading to apoptosis and seems to have an effect on cell-cycle regulation. Implicated in Notch signaling cross-over. Prevents CDK7 kinase activity when associated to CAK complex in response to DNA damage, thus stopping cell cycle progression. Isoform 2 enhances the transactivation activity of isoform 1 from some but not all TP53-inducible promoters. Isoform 4 suppresses transactivation activity and impairs growth suppression mediated by isoform 1. Isoform 7 inhibits isoform 1-mediated apoptosis. Regulates the circadian clock by repressing CLOCK-BMAL1-mediated transcriptional activation of PER2 (PubMed:24051492). 
{ECO:0000269|PubMed:11025664, ECO:0000269|PubMed:12524540, ECO:0000269|PubMed:12810724, ECO:0000269|PubMed:15186775, ECO:0000269|PubMed:15340061, ECO:0000269|PubMed:17189187, ECO:0000269|PubMed:17317671, ECO:0000269|PubMed:17349958, ECO:0000269|PubMed:19556538, ECO:0000269|PubMed:20673990, ECO:0000269|PubMed:20959462, ECO:0000269|PubMed:22726440, ECO:0000269|PubMed:24051492, ECO:0000269|PubMed:24652652, ECO:0000269|PubMed:35618207, ECO:0000269|PubMed:36634798, ECO:0000269|PubMed:38653238, ECO:0000269|PubMed:9840937}.']\nSequence: MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"P04637","entry_name":"P53_HUMAN","gene_names":[{"Name":"TP53","Synonyms":["P53"]}],"protein_name":"Cellular tumor antigen p53","organism":"Homo sapiens","sequence":"MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD","function":["Multifunctional transcription factor that induces cell cycle arrest, DNA repair or apoptosis upon binding to its target DNA sequence (PubMed:11025664, PubMed:12524540, PubMed:12810724, PubMed:15186775, PubMed:15340061, PubMed:17317671, PubMed:17349958, PubMed:19556538, PubMed:20673990, PubMed:20959462, PubMed:22726440, PubMed:24051492, PubMed:24652652, PubMed:35618207, PubMed:36634798, PubMed:38653238, PubMed:9840937). Acts as a tumor suppressor in many tumor types; induces growth arrest or apoptosis depending on the physiological circumstances and cell type (PubMed:11025664, PubMed:12524540, PubMed:12810724, PubMed:15186775, PubMed:15340061, PubMed:17189187, PubMed:17317671, PubMed:17349958, PubMed:19556538, PubMed:20673990, PubMed:20959462, PubMed:22726440, PubMed:24051492, PubMed:24652652, PubMed:38653238, PubMed:9840937). Negatively regulates cell division by controlling expression of a set of genes required for this process (PubMed:11025664, PubMed:12524540, PubMed:12810724, PubMed:15186775, PubMed:15340061, PubMed:17317671, PubMed:17349958, PubMed:19556538, PubMed:20673990, PubMed:20959462, PubMed:22726440, PubMed:24051492, PubMed:24652652, PubMed:9840937). One of the activated genes is an inhibitor of cyclin-dependent kinases. Apoptosis induction seems to be mediated either by stimulation of BAX and FAS antigen expression, or by repression of Bcl-2 expression (PubMed:12524540, PubMed:17189187). Its pro-apoptotic activity is activated via its interaction with PPP1R13B\/ASPP1 or TP53BP2\/ASPP2 (PubMed:12524540). However, this activity is inhibited when the interaction with PPP1R13B\/ASPP1 or TP53BP2\/ASPP2 is displaced by PPP1R13L\/iASPP (PubMed:12524540). In cooperation with mitochondrial PPIF is involved in activating oxidative stress-induced necrosis; the function is largely independent of transcription. Induces the transcription of long intergenic non-coding RNA p21 (lincRNA-p21) and lincRNA-Mkln1. 
LincRNA-p21 participates in TP53-dependent transcriptional repression leading to apoptosis and seems to have an effect on cell-cycle regulation. Implicated in Notch signaling cross-over. Prevents CDK7 kinase activity when associated to CAK complex in response to DNA damage, thus stopping cell cycle progression. Isoform 2 enhances the transactivation activity of isoform 1 from some but not all TP53-inducible promoters. Isoform 4 suppresses transactivation activity and impairs growth suppression mediated by isoform 1. Isoform 7 inhibits isoform 1-mediated apoptosis. Regulates the circadian clock by repressing CLOCK-BMAL1-mediated transcriptional activation of PER2 (PubMed:24051492). {ECO:0000269|PubMed:11025664, ECO:0000269|PubMed:12524540, ECO:0000269|PubMed:12810724, ECO:0000269|PubMed:15186775, ECO:0000269|PubMed:15340061, ECO:0000269|PubMed:17189187, ECO:0000269|PubMed:17317671, ECO:0000269|PubMed:17349958, ECO:0000269|PubMed:19556538, ECO:0000269|PubMed:20673990, ECO:0000269|PubMed:20959462, ECO:0000269|PubMed:22726440, ECO:0000269|PubMed:24051492, ECO:0000269|PubMed:24652652, ECO:0000269|PubMed:35618207, ECO:0000269|PubMed:36634798, ECO:0000269|PubMed:38653238, ECO:0000269|PubMed:9840937}."],"url":"https:\/\/www.uniprot.org\/uniprot\/P04637","_search_query":"P04637"} diff --git a/examples/input_examples/searched_rna_demo.jsonl b/examples/input_examples/searched_rna_demo.jsonl new file mode 100644 index 00000000..9ad088c0 --- /dev/null +++ b/examples/input_examples/searched_rna_demo.jsonl @@ -0,0 +1,6 @@ +{"_doc_id":"doc-URS0000123456","type":"rna","content":"Description: rRNA from 1 species\nSequence: CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG","data_source":"rnacentral","molecule_type":"RNA","database":"RNAcentral","id":"URS0000123456","rnacentral_id":"URS0000123456","sequence":"CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG","sequence_length":282,"rna_type":"rRNA","description":"rRNA from 1 species","url":"https:\/\/rnacentral.org\/rna\/URS0000123456","organism":"uncultured Staphylococcus sp.","related_genes":null,"gene_name":null,"so_term":"ncRNA","modifications":null,"_search_query":"URS0000123456"} +{"_doc_id":"doc-URS00000088CC","type":"rna","content":"Description: lncRNA from 1 species\nSequence: GCAGUUCUCAGCCAUGACAGAUGGGAGUUUCGGCCCAAUUGACCAGUAUUCCUUACUGAUAAGAGACACUGACCAUGGAGUGGUUCUGGUGAGAUGACAUGACCCUCGUGAAGGGGCCUGAAGCUUCAUUGUGUUUGUGUAUGUUUCUCUCUUCAAAAAUAUUCAUGACUUCUCCUGUAGCUUGAUAAAUAUGUAUAUUUACACACUGCA","data_source":"rnacentral","molecule_type":"RNA","database":"RNAcentral","id":"URS00000088CC","rnacentral_id":"URS00000088CC","sequence":"GCAGUUCUCAGCCAUGACAGAUGGGAGUUUCGGCCCAAUUGACCAGUAUUCCUUACUGAUAAGAGACACUGACCAUGGAGUGGUUCUGGUGAGAUGACAUGACCCUCGUGAAGGGGCCUGAAGCUUCAUUGUGUUUGUGUAUGUUUCUCUCUUCAAAAAUAUUCAUGACUUCUCCUGUAGCUUGAUAAAUAUGUAUAUUUACACACUGCA","sequence_length":210,"rna_type":"lncRNA","description":"lncRNA from 1 species","url":"https:\/\/rnacentral.org\/rna\/URS00000088CC","organism":"Homo 
sapiens","related_genes":["ENSG00000265458.1","lnc-C17orf62-1","ENSG00000265458","NONHSAG023099","HSALNG0119438","NONHSAG023099.2","ENSG00000265458.4","RP13-20L14.6","NARF-AS2"],"gene_name":"ENSG00000265458, ENSG00000265458.1, ENSG00000265458.4, HSALNG0119438, NARF-AS2, NONHSAG023099, NONHSAG023099.2, RP13-20L14.6, lnc-C17orf62-1","so_term":"antisense, ncRNA","modifications":null,"_search_query":"GCAGTTCTCAGCCATGACAGATGGGAGTTTCGGCCCAATTGACCAGTATTCCTTACTGATAAGAGACACTGACCATGGAGTGGTTCTGGTGAGATGACATGACCCTCGTGAAGGGGCCTGAAGCTTCATTGTGTTTGTGTATGTTTCTCTCTTCAAAAATATTCATGACTTCTCCTGTAGCTTGATAAATATGTATATTTACACACTGCA"} +{"_doc_id":"doc-URS000342178E","type":"rna","content":"Description: None misc RNA\nSequence: GGUUUUCGUAUAUCCUUAAUGAUAUGGUUUAAGGGCAAUACAUAGAAACCACAAAUUUCUUACUGCGAAAAUC","data_source":"rnacentral","molecule_type":"RNA","database":"RNAcentral","id":"URS000342178E","rnacentral_id":"URS000342178E","sequence":"GGUUUUCGUAUAUCCUUAAUGAUAUGGUUUAAGGGCAAUACAUAGAAACCACAAAUUUCUUACUGCGAAAAUC","sequence_length":73,"rna_type":"misc_RNA","description":"None misc RNA","url":"https:\/\/rnacentral.org\/rna\/URS000342178E","organism":null,"related_genes":null,"gene_name":null,"so_term":"ncRNA","modifications":null,"_search_query":"XIST regulator"} +{"_doc_id":"doc-URS0000123456","type":"rna","content":"Description: rRNA from 1 species\nSequence: CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG","data_source":"rnacentral","molecule_type":"RNA","database":"RNAcentral","id":"URS0000123456","rnacentral_id":"URS0000123456","sequence":"CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG","sequence_length":282,"rna_type":"rRNA","description":"rRNA from 1 species","url":"https:\/\/rnacentral.org\/rna\/URS0000123456","organism":"uncultured Staphylococcus sp.","related_genes":null,"gene_name":null,"so_term":"ncRNA","modifications":null,"_search_query":"CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"} +{"_doc_id":"doc-URS0000000787","type":"rna","content":"Description: lncRNA from 1 species\nSequence: 
AGGGAUCUUCUGCCCUUGGUCCUAAGUGCCACUAUCUGUGCUGAGUUUUUCAAAGGUCAGAGCAGAUUGAACCAUUGUGGUUUCAUUUUCCCUGAUUUUGAUUUUUCUUAUGGGGAACCUGUGUGGCUGCAUUCAAGGUGACUCGAAGAAGCCUUCCAAAAAGCAUGUGAAAAGGAAGCCCUACUCUACUACCAAGGUGACUUCAGGGAGCACAUUCAAUGGUACGUAUUCUGGAAUCACUCACUGGUUGUUAGAAAAGGAUUCUACAGGAAAUCUGGAGCUUAACUGCUGGCUUUUGUCUGGAGAGCCUCCAUGAUCCAAGACAUCUGGUGGGAAUGAGGAUGUAGGGUAUAGUAAAAGAAACUGGUUUUCCUGGUGACAUACUCUUUUUAUCUAUGUAUAGUUUCUGGGAACAUGUUCACAUUAGGUUGUGUGUGGGUAUGUGUGUAUUAGGGCGGGGGUGGGGUGAGGUGGUCUGUGUGCAAGUCUGCAUGAUUUGCUUGUGAAUGUGUGUCUAUGUGUGUUUCCCCUAGGAAAAAAAUGUUGUGUUUACCCAGCACAACUCUCAGUGCCAUU","data_source":"rnacentral","molecule_type":"RNA","database":"RNAcentral","id":"URS0000000787","rnacentral_id":"URS0000000787","sequence":"AGGGAUCUUCUGCCCUUGGUCCUAAGUGCCACUAUCUGUGCUGAGUUUUUCAAAGGUCAGAGCAGAUUGAACCAUUGUGGUUUCAUUUUCCCUGAUUUUGAUUUUUCUUAUGGGGAACCUGUGUGGCUGCAUUCAAGGUGACUCGAAGAAGCCUUCCAAAAAGCAUGUGAAAAGGAAGCCCUACUCUACUACCAAGGUGACUUCAGGGAGCACAUUCAAUGGUACGUAUUCUGGAAUCACUCACUGGUUGUUAGAAAAGGAUUCUACAGGAAAUCUGGAGCUUAACUGCUGGCUUUUGUCUGGAGAGCCUCCAUGAUCCAAGACAUCUGGUGGGAAUGAGGAUGUAGGGUAUAGUAAAAGAAACUGGUUUUCCUGGUGACAUACUCUUUUUAUCUAUGUAUAGUUUCUGGGAACAUGUUCACAUUAGGUUGUGUGUGGGUAUGUGUGUAUUAGGGCGGGGGUGGGGUGAGGUGGUCUGUGUGCAAGUCUGCAUGAUUUGCUUGUGAAUGUGUGUCUAUGUGUGUUUCCCCUAGGAAAAAAAUGUUGUGUUUACCCAGCACAACUCUCAGUGCCAUU","sequence_length":576,"rna_type":"lncRNA","description":"lncRNA from 1 species","url":"https:\/\/rnacentral.org\/rna\/URS0000000787","organism":"Homo sapiens","related_genes":["KB-1183D5.13","lnc-GGT2-26","ENSG00000206142.10","ENSG00000206142.9","NONHSAG033362.2","FAM230H","NONHSAG033362","lnc-GGT2-4","ENSG00000206142","lnc-GGT2-2","HSALNG0134219"],"gene_name":"ENSG00000206142, ENSG00000206142.10, ENSG00000206142.9, FAM230H, HSALNG0134219, KB-1183D5.13, NONHSAG033362, NONHSAG033362.2, lnc-GGT2-2, lnc-GGT2-26, lnc-GGT2-4","so_term":"lincRNA, ncRNA","modifications":null,"_search_query":"URS0000000787"} +{"_doc_id":"doc-URS0000000001","type":"rna","content":"Description: rRNA from 1 species\nSequence: AUUGAACGCUGGCGGCAGGCCUAACACAUGCAAGUCGAGCGGUAGAGAGAAGCUUGCUUCUCUUGAGAGCGGCGGACGGGUGAGUAAUGCCUAGGAAUCUGCCUGGUAGUGGGGGAUAACGCUCGGAAACGGACGCUAAUACCGCAUACGUCCUACGGGAGAAAGCAGGGGACCUUCGGGCCUUGCGCUAUCAGAUGAGC","data_source":"rnacentral","molecule_type":"RNA","database":"RNAcentral","id":"URS0000000001","rnacentral_id":"URS0000000001","sequence":"AUUGAACGCUGGCGGCAGGCCUAACACAUGCAAGUCGAGCGGUAGAGAGAAGCUUGCUUCUCUUGAGAGCGGCGGACGGGUGAGUAAUGCCUAGGAAUCUGCCUGGUAGUGGGGGAUAACGCUCGGAAACGGACGCUAAUACCGCAUACGUCCUACGGGAGAAAGCAGGGGACCUUCGGGCCUUGCGCUAUCAGAUGAGC","sequence_length":200,"rna_type":"rRNA","description":"rRNA from 1 species","url":"https:\/\/rnacentral.org\/rna\/URS0000000001","organism":"uncultured bacterium","related_genes":null,"gene_name":null,"so_term":"ncRNA","modifications":null,"_search_query":"URS0000000001"} diff --git a/graphgen/bases/base_operator.py b/graphgen/bases/base_operator.py index 9ccb8bf2..8d0cddb5 100644 --- a/graphgen/bases/base_operator.py +++ b/graphgen/bases/base_operator.py @@ -1,4 +1,12 @@ import inspect +import logging +import os +from abc import ABC, abstractmethod +from typing import Iterable, Union + +import pandas as pd +import ray + from graphgen.utils.log import CURRENT_LOGGER_VAR, set_logger diff --git a/graphgen/bases/base_reader.py b/graphgen/bases/base_reader.py index ff9ffb18..ba72f410 100644 --- a/graphgen/bases/base_reader.py +++ b/graphgen/bases/base_reader.py @@ -39,7 +39,8 @@ def _should_keep_item(self, item: Dict[str, Any]) -> bool: "table", "equation", "protein", - + "dna", + "rna", ], 
f"Unsupported item type: {item_type}" if item_type == "text": content = item.get(self.text_column, "").strip() diff --git a/graphgen/bases/base_searcher.py b/graphgen/bases/base_searcher.py index 42a26681..cba0315e 100644 --- a/graphgen/bases/base_searcher.py +++ b/graphgen/bases/base_searcher.py @@ -11,6 +11,36 @@ class BaseSearcher(ABC): Abstract base class for searching and retrieving data. """ + def __init__(self, working_dir: str = "cache"): + """ + Initialize the base searcher with a logger. + + :param working_dir: Working directory for log files. + """ + log_dir = os.path.join(working_dir, "logs") + searcher_name = self.__class__.__name__ + + # e.g. cache/logs/NCBISearch.log + log_file = os.path.join(log_dir, f"{searcher_name}.log") + + self.logger = set_logger( + log_file=log_file, name=searcher_name, + console_level=logging.ERROR, force=True + ) + + self.logger.info( + "[%s] Searcher initialized", searcher_name + ) + + @abstractmethod + async def search(self, query: str, **kwargs) -> List[Dict[str, Any]]: + """ + Search for data based on the given query. + + :param query: The searcher query. + :param kwargs: Additional keyword arguments for the searcher. + :return: List of dictionaries containing the searcher results. + """ def get_logger(self): """Get the logger instance.""" diff --git a/graphgen/engine.py b/graphgen/engine.py index cc4c4570..6d6961d5 100644 --- a/graphgen/engine.py +++ b/graphgen/engine.py @@ -1,16 +1,37 @@ import inspect import logging +import os +from collections import defaultdict, deque +from functools import wraps +from typing import Any, Callable, Dict, List, Set + +import ray +import ray.data +from ray.data import DataContext + +from graphgen.bases import Config, Node +from graphgen.utils import logger + + +class Engine: + def __init__( + self, config: Dict[str, Any], functions: Dict[str, Callable], **ray_init_kwargs + ): + self.config = Config(**config) + self.global_params = self.config.global_params + self.functions = functions + self.datasets: Dict[str, ray.data.Dataset] = {} + # Disable Ray Data progress bars and verbose output os.environ.setdefault("RAY_DATA_DISABLE_PROGRESS_BARS", "1") # Disable metrics exporter to avoid RpcError os.environ.setdefault("RAY_DISABLE_IMPORTANT_WARNING", "1") - try: - from ray.data import DataContext - ctx = DataContext.get_current() - ctx.enable_rich_progress_bars = False - ctx.use_ray_tqdm = False - except Exception: - pass # Ray Data context might not be available + ctx = DataContext.get_current() + ctx.enable_rich_progress_bars = False + ctx.use_ray_tqdm = False + # Disable tensor extension casting to avoid conversion errors with complex types + # (e.g., gene_synonyms, gene_names which are lists/arrays) + ctx.enable_tensor_extension_casting = False if not ray.is_initialized(): # Disable metrics exporter to avoid RpcError @@ -18,7 +39,7 @@ context = ray.init( ignore_reinit_error=True, logging_level=logging.ERROR, - log_to_driver=False, # Disable Ray logs to driver + log_to_driver=True, **ray_init_kwargs, ) logger.info("Ray Dashboard URL: %s", context.dashboard_url) diff --git a/graphgen/models/generator/__init__.py b/graphgen/models/generator/__init__.py index 12740f3b..ec41f5dc 100644 --- a/graphgen/models/generator/__init__.py +++ b/graphgen/models/generator/__init__.py @@ -2,6 +2,6 @@ from .atomic_generator import AtomicGenerator from .cot_generator import CoTGenerator from .multi_hop_generator import MultiHopGenerator - +from .omics_qa_generator import OmicsQAGenerator from .quiz_generator import QuizGenerator 
from .vqa_generator import VQAGenerator diff --git a/graphgen/models/generator/omics_qa_generator.py b/graphgen/models/generator/omics_qa_generator.py new file mode 100644 index 00000000..d5ac7ddb --- /dev/null +++ b/graphgen/models/generator/omics_qa_generator.py @@ -0,0 +1,365 @@ +import re +from typing import Any, Optional + +from graphgen.bases import BaseGenerator +from graphgen.templates import OMICS_QA_GENERATION_PROMPT +from graphgen.utils import compute_content_hash, detect_main_language, logger + + +class OmicsQAGenerator(BaseGenerator): + """ + Unified QA generator for multi-omics data (DNA, RNA, Protein). + Automatically extracts and attaches molecule-specific caption information to QA pairs. + """ + + @staticmethod + def build_prompt( + batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]] + ) -> str: + nodes, edges = batch + entities_str = "\n".join( + [ + f"{index + 1}. {node[0]}: {node[1]['description']}" + for index, node in enumerate(nodes) + ] + ) + + relationships_str = "\n".join( + [ + f"{index + 1}. {edge[0]} -- {edge[1]}: {edge[2]['description']}" + for index, edge in enumerate(edges) + ] + ) + language = detect_main_language(entities_str + relationships_str) + prompt = OMICS_QA_GENERATION_PROMPT[language].format( + entities=entities_str, relationships=relationships_str + ) + return prompt + + @staticmethod + def parse_response(response: str) -> Any: + """ + Parse the LLM response and return the generated QAs + :param response + :return: QA pairs + """ + qa_pairs = {} + qa_list = response.strip().split("\n\n") + for qa in qa_list: + match = re.search( + r"Question:\s*(.*?)\s*Answer:\s*(.*)", qa, re.DOTALL + ) or re.search(r"问题:\s*(.*?)\s*答案:\s*(.*)", qa, re.DOTALL) + + if match: + question = match.group(1).strip() + answer = match.group(2).strip() + else: + if qa: + logger.error("Failed to parse QA pair: %s", qa) + continue + question = question.strip('"') + answer = answer.strip('"') + logger.debug("Question: %s", question) + logger.debug("Answer: %s", answer) + qa_pairs[compute_content_hash(question)] = { + "question": question, + "answer": answer, + } + return qa_pairs + + @staticmethod + def _extract_caption(node_data: dict, molecule_type: str) -> Optional[dict]: + """ + Extract molecule-specific caption information from node data. 
+ + :param node_data: Node data dictionary + :param molecule_type: Type of molecule ("dna", "rna", or "protein") + :return: Caption dictionary or None + """ + molecule_type_lower = molecule_type.lower() + + # Check if there's already a caption field (e.g., protein_caption, dna_caption, rna_caption) + caption_key = f"{molecule_type_lower}_caption" + if caption_key in node_data and node_data[caption_key]: + if isinstance(node_data[caption_key], list) and len(node_data[caption_key]) > 0: + # Always return the first element if it's a dict, otherwise return None for consistency + caption_val = node_data[caption_key][0] + return caption_val if isinstance(caption_val, dict) else None + if isinstance(node_data[caption_key], dict): + return node_data[caption_key] + + # Field mappings for each molecule type + field_mapping = { + "protein": [ + "protein_name", "gene_names", "organism", "function", + "sequence", "id", "database", "entry_name", "uniprot_id" + ], + "dna": [ + "gene_name", "gene_description", "organism", "chromosome", + "genomic_location", "function", "gene_type", "id", + "database", "sequence" + ], + "rna": [ + "rna_type", "description", "organism", "related_genes", + "gene_name", "so_term", "id", "database", + "rnacentral_id", "sequence" + ], + } + + # Extract fields based on molecule type + caption = {} + caption_fields = field_mapping.get(molecule_type_lower, []) + for field in caption_fields: + if field in node_data and node_data[field]: + caption[field] = node_data[field] + + # Special handling for protein: check search results and existing protein field + if molecule_type_lower == "protein": + # Check for search result data (from UniProt search) + if "_search_results" in node_data: + search_results = node_data["_search_results"] + if isinstance(search_results, list) and len(search_results) > 0: + first_result = search_results[0] + if isinstance(first_result, dict): + search_caption = { + "id": first_result.get("id", ""), + "protein_name": first_result.get("protein_name", ""), + "gene_names": first_result.get("gene_names", []), + "organism": first_result.get("organism", ""), + "function": first_result.get("function", []), + "sequence": node_data.get("sequence") or first_result.get("sequence", ""), + "database": "UniProt" + } + # Remove empty fields and return if any data exists + search_caption = {k: v for k, v in search_caption.items() if v} + if search_caption: + return search_caption + + # Merge with existing protein field if present + if "protein" in node_data and node_data["protein"]: + existing_protein = node_data["protein"] + if isinstance(existing_protein, list) and len(existing_protein) > 0: + existing_protein = ( + existing_protein[0] + if isinstance(existing_protein[0], dict) + else existing_protein + ) + if isinstance(existing_protein, dict): + for key, value in existing_protein.items(): + if key not in caption and value: + caption[key] = value + # Ensure sequence from node_data takes precedence + if "sequence" in node_data and node_data["sequence"]: + caption["sequence"] = node_data["sequence"] + + # Fallback to description if no caption found + if not caption and "description" in node_data: + description = node_data["description"] + if isinstance(description, str) and len(description) > 10: + caption["description"] = description + + return caption if caption else None + + @staticmethod + def _detect_molecule_type(nodes: list[tuple[str, dict]]) -> str: + """ + Detect molecule type from nodes. 
+ Priority: Check node type, then check metadata, then check node data fields. + + :param nodes: List of (node_id, node_data) tuples + :return: Detected molecule type ("dna", "rna", "protein", or "unknown") + """ + if not nodes: + return "unknown" + + # Type indicators for each molecule type + type_indicators = { + "protein": { + "fields": ["protein_name", "uniprot_id", "entry_name", "protein_caption"], + "source_prefix": "protein-", + "description_keywords": ["protein"], + }, + "dna": { + "fields": ["gene_name", "chromosome", "genomic_location"], + "source_prefix": "dna-", + "description_keywords": ["gene", "dna", "chromosome"], + }, + "rna": { + "fields": ["rna_type", "rnacentral_id"], + "source_prefix": "rna-", + "description_keywords": ["rna", "transcript"], + }, + } + + for _, node_data in nodes: + # Priority 1: Check explicit type fields (most reliable) + for field in ["type", "molecule_type"]: + value = node_data.get(field, "").lower() + if value in ("dna", "rna", "protein"): + return value + + # Priority 2: Check source_id prefix + source_id = node_data.get("source_id", "").lower() + for mol_type, indicators in type_indicators.items(): + if source_id.startswith(indicators["source_prefix"]): + return mol_type + + # Priority 3: Check type-specific fields + for mol_type, indicators in type_indicators.items(): + if any(key in node_data for key in indicators["fields"]): + # Special check for DNA: need chromosome or genomic_location + if mol_type == "dna" and not any(key in node_data for key in ["chromosome", "genomic_location"]): + continue + return mol_type + + # Priority 4: Check description keywords + description = node_data.get("description", "").lower() + for mol_type, indicators in type_indicators.items(): + keywords = indicators["description_keywords"] + if any(kw in description for kw in keywords): + # Special check: "protein" in description but not "gene" + if mol_type == "protein" and "gene" in description: + continue + return mol_type + + return "unknown" + + async def generate( + self, + batch: tuple[ + list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]] + ], + ) -> dict[str, Any]: + """ + Generate QAs based on a given batch. + Automatically extracts and attaches molecule-specific caption information. 
+ + :param batch + :return: QA pairs with attached molecule captions + """ + result = {} + prompt = self.build_prompt(batch) + response = await self.llm_client.generate_answer(prompt) + qa_pairs = self.parse_response(response) # generate one or more QA pairs + + nodes, _ = batch + + # Detect molecule type from nodes + molecule_type = self._detect_molecule_type(nodes) + + # Extract captions for all molecule types from nodes + captions = {"dna": None, "rna": None, "protein": None} + caption_attached = False + + for node in nodes: + _, node_data = node + + # Check for pre-extracted captions (from partition_service) + for mol_type in ["dna", "rna", "protein"]: + caption_key = f"{mol_type}_caption" + if caption_key in node_data and node_data[caption_key]: + captions[mol_type] = node_data[caption_key] + caption_attached = True + + # If no pre-extracted captions, extract from node_data using the detected molecule_type + if not caption_attached: + caption = self._extract_caption(node_data, molecule_type) + if caption: + captions[molecule_type] = caption + caption_attached = True + break # Only need to extract once per batch + + # Attach all captions to QA pairs + for qa in qa_pairs.values(): + qa["dna"] = captions["dna"] if captions["dna"] else "" + qa["rna"] = captions["rna"] if captions["rna"] else "" + qa["protein"] = captions["protein"] if captions["protein"] else "" + + if not caption_attached: + node_sample = ( + dict(list(nodes[0][1].items())[:5]) if nodes else 'No nodes' + ) + logger.warning( + "No caption extracted for molecule_type=%s. Node data sample: %s", + molecule_type, node_sample + ) + + result.update(qa_pairs) + return result + + @staticmethod + def format_generation_results( + results: list[dict], output_data_format: str + ) -> list[dict[str, Any]]: + """ + Format generation results with molecule-specific caption fields. + Supports dna, rna, and protein fields in output. 
+ """ + # Extract QA pairs and molecule captions + qa_items = [ + { + "question": v["question"], + "answer": v["answer"], + "dna": v.get("dna", ""), + "rna": v.get("rna", ""), + "protein": v.get("protein", ""), + } + for item in results + for k, v in item.items() + ] + + # Format based on output format + if output_data_format == "Alpaca": + return [ + { + "instruction": qa["question"], + "input": "", + "output": qa["answer"], + "dna": qa["dna"], + "rna": qa["rna"], + "protein": qa["protein"], + } + for qa in qa_items + ] + if output_data_format == "Sharegpt": + return [ + { + "conversations": [ + { + "from": "human", + "value": [ + { + "text": qa["question"], + "dna": qa["dna"], + "rna": qa["rna"], + "protein": qa["protein"], + } + ], + }, + {"from": "gpt", "value": qa["answer"]}, + ] + } + for qa in qa_items + ] + if output_data_format == "ChatML": + return [ + { + "messages": [ + { + "role": "user", + "content": [ + { + "text": qa["question"], + "dna": qa["dna"], + "rna": qa["rna"], + "protein": qa["protein"], + } + ], + }, + {"role": "assistant", "content": qa["answer"]}, + ] + } + for qa in qa_items + ] + else: + raise ValueError(f"Unknown output data format: {output_data_format}") diff --git a/graphgen/models/kg_builder/__init__.py b/graphgen/models/kg_builder/__init__.py index cc8ff877..3dc4ae95 100644 --- a/graphgen/models/kg_builder/__init__.py +++ b/graphgen/models/kg_builder/__init__.py @@ -1,3 +1,3 @@ from .light_rag_kg_builder import LightRAGKGBuilder from .mm_kg_builder import MMKGBuilder - +from .omics_kg_builder import OmicsKGBuilder \ No newline at end of file diff --git a/graphgen/models/kg_builder/omics_kg_builder.py b/graphgen/models/kg_builder/omics_kg_builder.py new file mode 100644 index 00000000..c5c92a94 --- /dev/null +++ b/graphgen/models/kg_builder/omics_kg_builder.py @@ -0,0 +1,291 @@ +import re +from collections import Counter, defaultdict +from typing import Dict, List, Tuple + +import numpy as np + +from graphgen.bases import BaseGraphStorage, BaseKGBuilder, BaseLLMWrapper, Chunk +from graphgen.templates import KG_SUMMARIZATION_PROMPT, OMICS_KG_EXTRACTION_PROMPT +from graphgen.utils import ( + detect_main_language, + handle_single_entity_extraction, + handle_single_relationship_extraction, + logger, + pack_history_conversations, + split_string_by_multi_markers, +) + + +class OmicsKGBuilder(BaseKGBuilder): + """ + Knowledge graph builder for multi-omics data (DNA, RNA, protein). + Extracts entities and relationships from sequence chunks and their metadata. + """ + + def __init__(self, llm_client: BaseLLMWrapper, max_loop: int = 3): + super().__init__(llm_client) + self.max_loop = max_loop + + async def extract( + self, chunk: Chunk + ) -> Tuple[Dict[str, List[dict]], Dict[Tuple[str, str], List[dict]]]: + """ + Extract entities and relationships from a sequence chunk using the LLM client. 
+ + :param chunk: Sequence chunk with metadata + :return: (nodes_data, edges_data) + """ + chunk_id = chunk.id + content = chunk.content + metadata = chunk.metadata + + # Extract sequence and metadata information + sequence_chunk = content or metadata.get("sequence", "") + # molecule_type is used in _format_metadata indirectly via metadata dict + _ = metadata.get("molecule_type", "").lower() + + # Build metadata text for prompt + metadata_text = self._format_metadata(metadata) + + # Detect language from metadata text (defaults to English if no Chinese detected) + language = detect_main_language(metadata_text) + + # Build prompt with sequence and metadata + hint_prompt = OMICS_KG_EXTRACTION_PROMPT[language]["TEMPLATE"].format( + **OMICS_KG_EXTRACTION_PROMPT["FORMAT"], + metadata_text=metadata_text, + sequence_chunk=sequence_chunk[:500] if sequence_chunk else "", # Limit sequence length in prompt + ) + + # step 2: initial glean + final_result = await self.llm_client.generate_answer(hint_prompt) + logger.debug("First extraction result: %s", final_result) + + # step 3: iterative refinement + history = pack_history_conversations(hint_prompt, final_result) + for loop_idx in range(self.max_loop): + if_loop_result = await self.llm_client.generate_answer( + text=OMICS_KG_EXTRACTION_PROMPT[language]["IF_LOOP"], history=history + ) + if_loop_result = if_loop_result.strip().strip('"').strip("'").lower() + if if_loop_result != "yes": + break + + glean_result = await self.llm_client.generate_answer( + text=OMICS_KG_EXTRACTION_PROMPT[language]["CONTINUE"], history=history + ) + logger.debug("Loop %s glean: %s", loop_idx + 1, glean_result) + + history += pack_history_conversations( + OMICS_KG_EXTRACTION_PROMPT[language]["CONTINUE"], glean_result + ) + final_result += glean_result + + # step 4: parse the final result + records = split_string_by_multi_markers( + final_result, + [ + OMICS_KG_EXTRACTION_PROMPT["FORMAT"]["record_delimiter"], + OMICS_KG_EXTRACTION_PROMPT["FORMAT"]["completion_delimiter"], + ], + ) + + nodes = defaultdict(list) + edges = defaultdict(list) + + for record in records: + match = re.search(r"\((.*)\)", record) + if not match: + continue + inner = match.group(1) + + attributes = split_string_by_multi_markers( + inner, [OMICS_KG_EXTRACTION_PROMPT["FORMAT"]["tuple_delimiter"]] + ) + + entity = await handle_single_entity_extraction(attributes, chunk_id) + if entity is not None: + nodes[entity["entity_name"]].append(entity) + continue + + relation = await handle_single_relationship_extraction(attributes, chunk_id) + if relation is not None: + key = (relation["src_id"], relation["tgt_id"]) + edges[key].append(relation) + + return dict(nodes), dict(edges) + + @staticmethod + def _format_metadata(metadata: dict) -> str: + """ + Format metadata dictionary into a readable text string for the prompt. 
+ + :param metadata: Metadata dictionary from chunk + :return: Formatted metadata text + """ + # Filter out internal fields and format + exclude_fields = { + "_doc_id", + "_chunk_id", + "chunk_index", + "total_chunks", + "length", + "type", + "content", + "sequence", + } + + metadata_items = [] + for key, value in metadata.items(): + if key in exclude_fields: + continue + # Handle numpy arrays and other array-like objects + if isinstance(value, np.ndarray): + if value.size == 0: + continue + # Convert numpy array to list for processing + value = value.tolist() + if value is None: + continue + # Check for empty string (but not numpy arrays which we already handled) + if isinstance(value, str) and value == "": + continue + if isinstance(value, list): + value = ", ".join(str(v) for v in value) + metadata_items.append(f"{key}: {value}") + + return "\n".join(metadata_items) if metadata_items else "No additional metadata available." + + async def merge_nodes( + self, + node_data: tuple[str, List[dict]], + kg_instance: BaseGraphStorage, + ) -> None: + """Merge extracted nodes into the knowledge graph.""" + entity_name, node_data = node_data + entity_types = [] + source_ids = [] + descriptions = [] + + node = kg_instance.get_node(entity_name) + if node is not None: + entity_types.append(node["entity_type"]) + source_ids.extend( + split_string_by_multi_markers(node["source_id"], [""]) + ) + descriptions.append(node["description"]) + + # Take the most frequent entity_type + entity_type = sorted( + Counter([dp["entity_type"] for dp in node_data] + entity_types).items(), + key=lambda x: x[1], + reverse=True, + )[0][0] + + description = "".join( + sorted(set([dp["description"] for dp in node_data] + descriptions)) + ) + description = await self._handle_kg_summary(entity_name, description) + + source_id = "".join( + set([dp["source_id"] for dp in node_data] + source_ids) + ) + + node_data_dict = { + "entity_type": entity_type, + "description": description, + "source_id": source_id, + } + + # Preserve sequence from existing node if present (e.g., added by partition_service) + if node is not None and "sequence" in node and node["sequence"]: + node_data_dict["sequence"] = node["sequence"] + + kg_instance.upsert_node(entity_name, node_data=node_data_dict) + + async def merge_edges( + self, + edges_data: tuple[Tuple[str, str], List[dict]], + kg_instance: BaseGraphStorage, + ) -> None: + """Merge extracted edges into the knowledge graph.""" + (src_id, tgt_id), edge_data = edges_data + + # Skip self-loops (edges where source and target are the same) + # This can happen when LLM extracts invalid relationships + if src_id == tgt_id: + logger.debug("Skipping self-loop edge: (%s, %s)", src_id, tgt_id) + return + + source_ids = [] + descriptions = [] + + edge = kg_instance.get_edge(src_id, tgt_id) + if edge is not None: + source_ids.extend( + split_string_by_multi_markers(edge["source_id"], [""]) + ) + descriptions.append(edge["description"]) + + description = "".join( + sorted(set([dp["description"] for dp in edge_data] + descriptions)) + ) + source_id = "".join( + set([dp["source_id"] for dp in edge_data] + source_ids) + ) + + for insert_id in [src_id, tgt_id]: + if not kg_instance.has_node(insert_id): + kg_instance.upsert_node( + insert_id, + node_data={ + "source_id": source_id, + "description": description, + "entity_type": "UNKNOWN", + }, + ) + + description = await self._handle_kg_summary( + f"({src_id}, {tgt_id})", description + ) + + kg_instance.upsert_edge( + src_id, + tgt_id, + edge_data={"source_id": 
source_id, "description": description}, + ) + + async def _handle_kg_summary( + self, + entity_or_relation_name: str, + description: str, + max_summary_tokens: int = 200, + ) -> str: + """ + Handle knowledge graph summary for omics entities/relations. + + :param entity_or_relation_name + :param description + :param max_summary_tokens + :return summary + """ + tokenizer_instance = self.llm_client.tokenizer + language = detect_main_language(description) + + tokens = tokenizer_instance.encode(description) + if len(tokens) < max_summary_tokens: + return description + + use_description = tokenizer_instance.decode(tokens[:max_summary_tokens]) + prompt = KG_SUMMARIZATION_PROMPT[language]["TEMPLATE"].format( + entity_name=entity_or_relation_name, + description_list=use_description.split(""), + **KG_SUMMARIZATION_PROMPT["FORMAT"], + ) + new_description = await self.llm_client.generate_answer(prompt) + logger.info( + "Entity or relation %s summary: %s", + entity_or_relation_name, + new_description, + ) + return new_description diff --git a/graphgen/models/partitioner/anchor_bfs_partitioner.py b/graphgen/models/partitioner/anchor_bfs_partitioner.py index 50e607ee..1e5e1903 100644 --- a/graphgen/models/partitioner/anchor_bfs_partitioner.py +++ b/graphgen/models/partitioner/anchor_bfs_partitioner.py @@ -18,6 +18,10 @@ class AnchorBFSPartitioner(BFSPartitioner): 2. Expand the community using BFS until the max unit size is reached.(A unit is a node or an edge.) 3. Non-anchor units can only be "pulled" into a community and never become seeds themselves. For example, for VQA tasks, we may want to use image nodes as anchors and expand to nearby text nodes and edges. + """ + + def __init__( + self, anchor_type: Union[ Literal["image", "dna", "rna", "protein"], List[Literal["dna", "rna", "protein"]], diff --git a/graphgen/models/reader/json_reader.py b/graphgen/models/reader/json_reader.py index c09453c7..a25abee8 100644 --- a/graphgen/models/reader/json_reader.py +++ b/graphgen/models/reader/json_reader.py @@ -6,6 +6,44 @@ import ray.data from graphgen.bases.base_reader import BaseReader +from graphgen.utils import logger + + +class JSONReader(BaseReader): + """ + Reader for JSON and JSONL files. + Columns: + - type: The type of the document (e.g., "text", "image", etc.) + - if type is "text", "content" column must be present. + """ + + def read(self, input_path: Union[str, List[str]]) -> ray.data.Dataset: + """ + Read JSON file and return Ray Dataset. + :param input_path: Path to JSON/JSONL file or list of JSON/JSONL files. + :return: Ray Dataset containing validated and filtered data. + """ + if self.modalities and len(self.modalities) >= 2: + ds: ray.data.Dataset = ray.data.from_items([]) + for file in input_path if isinstance(input_path, list) else [input_path]: + data = [] + if file.endswith(".jsonl"): + with open(file, "r", encoding="utf-8") as f: + for line in f: + item = json.loads(line) + data.append(item) + else: + with open(file, "r", encoding="utf-8") as f: + data = json.load(f) + data = self._unify_schema(data) + file_ds: ray.data.Dataset = ray.data.from_items(data) + ds = ds.union(file_ds) # type: ignore + else: + ds = ray.data.read_json(input_path) + ds = ds.map_batches(self._validate_batch, batch_format="pandas") + ds = ds.filter(self._should_keep_item) + return ds + def read_stream(self, file_path: str) -> Iterator[Dict[str, Any]]: """ Stream read JSONL files line by line without loading entire file into memory. 
diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index 4ddc8138..efaacf4b 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -23,7 +23,149 @@ @lru_cache(maxsize=None) def _get_pool(): + return ThreadPoolExecutor(max_workers=20) # NOTE:can increase for better parallelism + + +# ensure only one NCBI request at a time +_blast_lock = asyncio.Lock() + + +class NCBISearch(BaseSearcher): + """ + NCBI Search client to search DNA/GenBank/Entrez databases. + 1) Get the gene/DNA by accession number or gene ID. + 2) Search with keywords or gene names (fuzzy search). + 3) Search with FASTA sequence (BLAST search for DNA sequences). + + API Documentation: https://www.ncbi.nlm.nih.gov/home/develop/api/ + Note: NCBI has rate limits (max 3 requests per second), delays are required between requests. + """ + + def __init__( + self, + use_local_blast: bool = False, + local_blast_db: str = "nt_db", + email: str = "email@example.com", + api_key: str = "", + tool: str = "GraphGen", + blast_num_threads: int = 4, + working_dir: str = "cache", + ): + """ + Initialize the NCBI Search client. + + Args: + use_local_blast (bool): Whether to use local BLAST database. + local_blast_db (str): Path to the local BLAST database. + email (str): Email address for NCBI API requests. + api_key (str): API key for NCBI API requests, see https://account.ncbi.nlm.nih.gov/settings/. + tool (str): Tool name for NCBI API requests. + blast_num_threads (int): Number of threads for BLAST search. + working_dir (str): Working directory for log files. + """ + super().__init__(working_dir=working_dir) + Entrez.timeout = 60 # 60 seconds timeout + Entrez.email = email + Entrez.tool = tool + if api_key: + Entrez.api_key = api_key + Entrez.max_tries = 10 if api_key else 3 + Entrez.sleep_between_tries = 5 + self.use_local_blast = use_local_blast + self.local_blast_db = local_blast_db + self.blast_num_threads = blast_num_threads + if self.use_local_blast: + # Check for single-file database (.nhr) or multi-file database (.00.nhr) + db_exists = ( + os.path.isfile(f"{self.local_blast_db}.nhr") or + os.path.isfile(f"{self.local_blast_db}.00.nhr") + ) + if not db_exists: + self.logger.error("Local BLAST database files not found. Please check the path.") + self.logger.error("Expected: %s.nhr or %s.00.nhr", self.local_blast_db, self.local_blast_db) + self.use_local_blast = False + + @staticmethod + def _nested_get(data: dict, *keys, default=None): + """Safely traverse nested dictionaries.""" + for key in keys: + if not isinstance(data, dict): + return default + data = data.get(key, default) + return data + + @staticmethod + def _infer_molecule_type_detail(accession: Optional[str], gene_type: Optional[int] = None) -> Optional[str]: + """Infer molecule_type_detail from accession prefix or gene type.""" + if accession: + # Map accession prefixes to molecule types + prefix_map = { + ("NM_", "XM_"): "mRNA", + ("NC_", "NT_"): "genomic DNA", + ("NR_", "XR_"): "RNA", + ("NG_",): "genomic region", + } + for prefixes, mol_type in prefix_map.items(): + if accession.startswith(prefixes): + return mol_type + # Fallback: infer from gene type if available + if gene_type is not None: + gene_type_map = { + 3: "rRNA", + 4: "tRNA", + 5: "snRNA", + 6: "ncRNA", + } + return gene_type_map.get(gene_type) + return None + def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict: + """ + Convert an Entrez gene record to a dictionary. 
+ All extraction logic is inlined for maximum clarity and performance. + """ + if not gene_record: + raise ValueError("Empty gene record") + + data = gene_record[0] + locus = (data.get("Entrezgene_locus") or [{}])[0] + + # Extract common nested paths once + gene_ref = self._nested_get(data, "Entrezgene_gene", "Gene-ref", default={}) + biosource = self._nested_get(data, "Entrezgene_source", "BioSource", default={}) + + # Process synonyms + synonyms_raw = gene_ref.get("Gene-ref_syn", []) + gene_synonyms = [] + if isinstance(synonyms_raw, list): + for syn in synonyms_raw: + gene_synonyms.append(syn.get("Gene-ref_syn_E") if isinstance(syn, dict) else str(syn)) + elif synonyms_raw: + gene_synonyms.append(str(synonyms_raw)) + + # Extract location info + label = locus.get("Gene-commentary_label", "") + chromosome_match = re.search(r"Chromosome\s+(\S+)", str(label)) if label else None + + seq_interval = self._nested_get( + locus, "Gene-commentary_seqs", 0, "Seq-loc_int", "Seq-interval", default={} + ) + genomic_location = ( + f"{seq_interval.get('Seq-interval_from')}-{seq_interval.get('Seq-interval_to')}" + if seq_interval.get('Seq-interval_from') and seq_interval.get('Seq-interval_to') + else None + ) + + # Extract representative accession (prefer type 3 = mRNA/transcript) + representative_accession = next( + ( + product.get("Gene-commentary_accession") + for product in locus.get("Gene-commentary_products", []) + if product.get("Gene-commentary_type") == "3" + ), + None, + ) + # Fallback: if no type 3 accession, try any available accession if not representative_accession: representative_accession = next( ( diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py index b59d9f39..d0a27efe 100644 --- a/graphgen/models/searcher/db/rnacentral_searcher.py +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -151,6 +151,16 @@ def _calculate_md5(sequence: str) -> str: return hashlib.md5(normalized_seq.encode("ascii")).hexdigest() + def get_by_rna_id(self, rna_id: str) -> Optional[dict]: + """ + Get RNA information by RNAcentral ID. + :param rna_id: RNAcentral ID (e.g., URS0000000001). + :return: A dictionary containing RNA information or None if not found. + """ + try: + url = f"{self.base_url}/rna/{rna_id}" + url += "?flat=true" + resp = requests.get(url, headers=self.headers, timeout=self.api_timeout) resp.raise_for_status() diff --git a/graphgen/models/searcher/db/uniprot_searcher.py b/graphgen/models/searcher/db/uniprot_searcher.py index 899b715d..ca32ff0e 100644 --- a/graphgen/models/searcher/db/uniprot_searcher.py +++ b/graphgen/models/searcher/db/uniprot_searcher.py @@ -68,7 +68,7 @@ def get_by_accession(self, accession: str) -> Optional[dict]: @staticmethod def _swissprot_to_dict(record: SwissProt.Record) -> dict: - """error + """ Convert a SwissProt.Record to a dictionary. """ functions = [] @@ -182,6 +182,13 @@ def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]: def _local_blast(self, seq: str, threshold: float) -> Optional[str]: """ Perform local BLAST search using local BLAST database. + Optimized with multi-threading and faster output format. 
+ """ + try: + with tempfile.NamedTemporaryFile(mode="w+", suffix=".fa", delete=False) as tmp: + tmp.write(f">query\n{seq}\n") + tmp_name = tmp.name + # Optimized BLAST command with: # - num_threads: Use multiple threads for faster search # - outfmt 6 sacc: Only return accession (minimal output) diff --git a/graphgen/models/storage/kv/json_storage.py b/graphgen/models/storage/kv/json_storage.py index aa2bce19..4af8d1e5 100644 --- a/graphgen/models/storage/kv/json_storage.py +++ b/graphgen/models/storage/kv/json_storage.py @@ -1,5 +1,48 @@ import os from dataclasses import dataclass +from typing import Iterator, Tuple + +from graphgen.bases.base_storage import BaseKVStorage +from graphgen.utils import load_json, write_json + + +@dataclass +class JsonKVStorage(BaseKVStorage): + _data: dict[str, dict] = None + + def __post_init__(self): + self._file_name = os.path.join(self.working_dir, f"{self.namespace}.json") + self._data = load_json(self._file_name) or {} + print(f"Load KV {self.namespace} with {len(self._data)} data") + + @property + def data(self): + return self._data + + def all_keys(self) -> list[str]: + return list(self._data.keys()) + + def index_done_callback(self): + write_json(self._data, self._file_name) + + def get_by_id(self, id): + return self._data.get(id, None) + + def get_by_ids(self, ids, fields=None) -> list: + if fields is None: + return [self._data.get(id, None) for id in ids] + return [ + ( + {k: v for k, v in self._data[id].items() if k in fields} + if self._data.get(id, None) + else None + ) + for id in ids + ] + + def get_all(self) -> dict[str, dict]: + return self._data + def iter_items(self) -> Iterator[Tuple[str, dict]]: """ Iterate over all items without loading everything into memory at once. diff --git a/graphgen/operators/__init__.py b/graphgen/operators/__init__.py index d8e78927..5bb1261a 100644 --- a/graphgen/operators/__init__.py +++ b/graphgen/operators/__init__.py @@ -5,7 +5,7 @@ from .judge import JudgeService from .partition import PartitionService from .quiz import QuizService -from .read import read, read_files +from .read import read from .search import SearchService operators = { diff --git a/graphgen/operators/build_kg/build_kg_service.py b/graphgen/operators/build_kg/build_kg_service.py index aabba3b9..6394dac2 100644 --- a/graphgen/operators/build_kg/build_kg_service.py +++ b/graphgen/operators/build_kg/build_kg_service.py @@ -8,6 +8,10 @@ from graphgen.utils import logger from .build_mm_kg import build_mm_kg +from .build_omics_kg import build_omics_kg + + +class BuildKGService(BaseOperator): def __init__(self, working_dir: str = "cache"): super().__init__(working_dir=working_dir, op_name="build_kg_service") self.llm_client: BaseLLMWrapper = init_llm("synthesizer") @@ -33,6 +37,31 @@ def build_kg(self, chunks: List[Chunk]) -> None: for chunk in chunks if chunk.type in ("image", "video", "table", "formula") ] + omics_chunks = [ + chunk + for chunk in chunks + if chunk.type in ("dna", "rna", "protein") + ] + + if len(text_chunks) == 0: + logger.info("All text chunks are already in the storage") + else: + logger.info("[Text Entity and Relation Extraction] processing ...") + # Note: build_text_kg is not imported, keeping omics processing only for now + # build_text_kg( + # llm_client=self.llm_client, + # kg_instance=self.graph_storage, + # chunks=text_chunks, + # ) + if len(mm_chunks) == 0: + logger.info("All multi-modal chunks are already in the storage") + else: + logger.info("[Multi-modal Entity and Relation Extraction] processing ...") + 
build_mm_kg( + llm_client=self.llm_client, + kg_instance=self.graph_storage, + chunks=mm_chunks, + ) if len(omics_chunks) == 0: logger.info("All omics chunks are already in the storage") else: diff --git a/graphgen/operators/build_kg/build_omics_kg.py b/graphgen/operators/build_kg/build_omics_kg.py new file mode 100644 index 00000000..503bb7eb --- /dev/null +++ b/graphgen/operators/build_kg/build_omics_kg.py @@ -0,0 +1,51 @@ +from collections import defaultdict +from typing import List + +from graphgen.bases import BaseLLMWrapper +from graphgen.bases.base_storage import BaseGraphStorage +from graphgen.bases.datatypes import Chunk +from graphgen.models import OmicsKGBuilder +from graphgen.utils import run_concurrent + + +def build_omics_kg( + llm_client: BaseLLMWrapper, + kg_instance: BaseGraphStorage, + chunks: List[Chunk], +): + """ + Build knowledge graph from multi-omics chunks (DNA, RNA, protein). + + :param llm_client: Synthesizer LLM model to extract entities and relationships + :param kg_instance: Graph storage instance + :param chunks: List of omics chunks + :return: None + """ + kg_builder = OmicsKGBuilder(llm_client=llm_client, max_loop=3) + + results = run_concurrent( + kg_builder.extract, + chunks, + desc="[2/4] Extracting entities and relationships from omics chunks", + unit="chunk", + ) + + nodes = defaultdict(list) + edges = defaultdict(list) + for n, e in results: + for k, v in n.items(): + nodes[k].extend(v) + for k, v in e.items(): + edges[tuple(sorted(k))].extend(v) + + run_concurrent( + lambda kv: kg_builder.merge_nodes(kv, kg_instance=kg_instance), + list(nodes.items()), + desc="Inserting omics entities into storage", + ) + + run_concurrent( + lambda kv: kg_builder.merge_edges(kv, kg_instance=kg_instance), + list(edges.items()), + desc="Inserting omics relationships into storage", + ) diff --git a/graphgen/operators/generate/generate_service.py b/graphgen/operators/generate/generate_service.py index 720b8488..7ad52dec 100644 --- a/graphgen/operators/generate/generate_service.py +++ b/graphgen/operators/generate/generate_service.py @@ -7,6 +7,37 @@ AtomicGenerator, CoTGenerator, MultiHopGenerator, + OmicsQAGenerator, + VQAGenerator, +) +from graphgen.utils import logger, run_concurrent + + +class GenerateService(BaseOperator): + """ + Generate question-answer pairs based on nodes and edges. 
+ """ + + def __init__( + self, + working_dir: str = "cache", + method: str = "aggregated", + data_format: str = "ChatML", + ): + super().__init__(working_dir=working_dir, op_name="generate_service") + self.llm_client: BaseLLMWrapper = init_llm("synthesizer") + + self.method = method + self.data_format = data_format + + if self.method == "atomic": + self.generator = AtomicGenerator(self.llm_client) + elif self.method == "aggregated": + self.generator = AggregatedGenerator(self.llm_client) + elif self.method == "multi_hop": + self.generator = MultiHopGenerator(self.llm_client) + elif self.method == "cot": + self.generator = CoTGenerator(self.llm_client) elif self.method == "omics_qa": self.generator = OmicsQAGenerator(self.llm_client) elif self.method in ["vqa"]: diff --git a/graphgen/operators/partition/partition_service.py b/graphgen/operators/partition/partition_service.py index a2d55213..dbb52b92 100644 --- a/graphgen/operators/partition/partition_service.py +++ b/graphgen/operators/partition/partition_service.py @@ -2,6 +2,7 @@ import os from typing import Iterable +import numpy as np import pandas as pd from graphgen.bases import BaseGraphStorage, BaseKVStorage, BaseOperator, BaseTokenizer @@ -136,6 +137,15 @@ def _attach_additional_data_to_node(self, batch: tuple) -> tuple: for node_id, node_data in nodes_data: entity_type = (node_data.get("entity_type") or "").lower() + if not entity_type: + continue + + source_ids = [ + sid.strip() + for sid in node_data.get("source_id", "").split("") + if sid.strip() + ] + if not source_ids: continue @@ -153,5 +163,125 @@ def _attach_additional_data_to_node(self, batch: tuple) -> tuple: node_data["image_data"] = json.loads(image_chunks[0]["content"]) logger.debug("Attached image data to node %s", node_id) + # Handle omics data (protein/dna/rna) + molecule_type = None + if entity_type in ("protein", "dna", "rna"): + molecule_type = entity_type + else: + # Infer from source_id prefix + for sid in source_ids: + sid_lower = sid.lower() + if sid_lower.startswith("protein-"): + molecule_type = "protein" + break + if sid_lower.startswith("dna-"): + molecule_type = "dna" + break + if sid_lower.startswith("rna-"): + molecule_type = "rna" + break + + if molecule_type: + omics_chunks = [ + data + for sid in source_ids + if (data := self.chunk_storage.get_by_id(sid)) + ] + + if not omics_chunks: + logger.warning( + "No chunks found for node %s (type: %s) with source_ids: %s", + node_id, molecule_type, source_ids + ) + continue + + def get_chunk_value(chunk: dict, field: str): + # First check root level of chunk + if field in chunk: + return chunk[field] + # Then check metadata if it exists and is a dict + chunk_metadata = chunk.get("metadata") + if isinstance(chunk_metadata, dict) and field in chunk_metadata: + return chunk_metadata[field] + return None + + # Group chunks by molecule type to preserve all types of sequences + chunks_by_type = {"dna": [], "rna": [], "protein": []} + for chunk in omics_chunks: + chunk_id = chunk.get("_chunk_id", "").lower() + if chunk_id.startswith("dna-"): + chunks_by_type["dna"].append(chunk) + elif chunk_id.startswith("rna-"): + chunks_by_type["rna"].append(chunk) + elif chunk_id.startswith("protein-"): + chunks_by_type["protein"].append(chunk) + + # Field mappings for each molecule type + field_mapping = { + "protein": [ + "protein_name", "gene_names", "organism", "function", + "sequence", "id", "database", "entry_name", "uniprot_id" + ], + "dna": [ + "gene_name", "gene_description", "organism", "chromosome", + 
"genomic_location", "function", "gene_type", "sequence", + "id", "database" + ], + "rna": [ + "rna_type", "description", "organism", "related_genes", + "gene_name", "so_term", "sequence", "id", "database", + "rnacentral_id" + ], + } + + # Extract and store captions for each molecule type + for mol_type in ["dna", "rna", "protein"]: + type_chunks = chunks_by_type[mol_type] + if not type_chunks: + continue + + # Use the first chunk of this type + type_chunk = type_chunks[0] + caption = {} + + # Extract all relevant fields for this molecule type + for field in field_mapping.get(mol_type, []): + value = get_chunk_value(type_chunk, field) + # Handle numpy arrays properly - check size instead of truthiness + if isinstance(value, np.ndarray): + if value.size > 0: + caption[field] = value.tolist() # Convert to list for compatibility + elif value: # For other types, use normal truthiness check + caption[field] = value + + # Store caption if it has any data + if caption: + caption_key = f"{mol_type}_caption" + node_data[caption_key] = caption + logger.debug("Stored %s caption for node %s with %d fields", mol_type, node_id, len(caption)) + + # For backward compatibility, also attach sequence and other fields from the primary molecule type + # Use the detected molecule_type or default to the first available type + primary_chunk = None + if chunks_by_type.get(molecule_type): + primary_chunk = chunks_by_type[molecule_type][0] + elif chunks_by_type["dna"]: + primary_chunk = chunks_by_type["dna"][0] + elif chunks_by_type["rna"]: + primary_chunk = chunks_by_type["rna"][0] + elif chunks_by_type["protein"]: + primary_chunk = chunks_by_type["protein"][0] + else: + primary_chunk = omics_chunks[0] + + # Attach sequence if not already present (for backward compatibility) + if "sequence" not in node_data: + sequence = get_chunk_value(primary_chunk, "sequence") + # Handle numpy arrays properly + if isinstance(sequence, np.ndarray): + if sequence.size > 0: + node_data["sequence"] = sequence.tolist() # Convert to list for compatibility + elif sequence: # For other types, use normal truthiness check + node_data["sequence"] = sequence return nodes_data, edges_data diff --git a/graphgen/operators/read/__init__.py b/graphgen/operators/read/__init__.py index cd22453e..cda44587 100644 --- a/graphgen/operators/read/__init__.py +++ b/graphgen/operators/read/__init__.py @@ -1 +1 @@ -from .read import read, read_files +from .read import read diff --git a/graphgen/operators/read/read.py b/graphgen/operators/read/read.py index e05f7e08..fbed377e 100644 --- a/graphgen/operators/read/read.py +++ b/graphgen/operators/read/read.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import Any, Dict, Iterator, List, Optional, Union +from typing import Any, List, Optional, Union import ray @@ -126,4 +126,3 @@ def read( except Exception as e: logger.error("[READ] Failed to read files from %s: %s", input_path, e) raise - diff --git a/graphgen/operators/search/multi_omics_search.py b/graphgen/operators/search/multi_omics_search.py new file mode 100644 index 00000000..fbe10f06 --- /dev/null +++ b/graphgen/operators/search/multi_omics_search.py @@ -0,0 +1,29 @@ +import re +from typing import Dict, Optional + +from graphgen.models import UniProtSearch + + +def _fetch_uniprot(entry: str) -> Optional[Dict]: + entry = entry.strip() + client = UniProtSearch() + + # 1. 
first try accession search + if re.fullmatch( + r"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}", entry + ): + return client.get_by_accession(entry) + + # 2. then try keyword search + return client.get_best_hit(entry) + + +def multi_omics_search(entry: str) -> Dict: + """ + Multi-omics search function that tries to fetch protein/gene information. + """ + # TODO: Extend this function to include more omics databases as needed. + result = _fetch_uniprot(entry) + if result: + return {"input": entry, "uniprot": result} + return {"input": entry, "uniprot": None} diff --git a/graphgen/operators/search/search_all.py b/graphgen/operators/search/search_all.py deleted file mode 100644 index 6017cfee..00000000 --- a/graphgen/operators/search/search_all.py +++ /dev/null @@ -1,83 +0,0 @@ -""" -To use Google Web Search API, -follow the instructions [here](https://developers.google.com/custom-search/v1/overview) -to get your Google searcher api key. - -To use Bing Web Search API, -follow the instructions [here](https://www.microsoft.com/en-us/bing/apis/bing-web-search-api) -and obtain your Bing subscription key. -""" - - -from graphgen.utils import logger, run_concurrent - - -async def search_all( - seed_data: dict, - search_config: dict, -) -> dict: - """ - Perform searches across multiple search types and aggregate the results. - :param seed_data: A dictionary containing seed data with entity names. - :param search_config: A dictionary specifying which data sources to use for searching. - :return: A dictionary with - """ - - results = {} - data_sources = search_config.get("data_sources", []) - - for data_source in data_sources: - data = list(seed_data.values()) - data = [d["content"] for d in data if "content" in d] - data = list(set(data)) # Remove duplicates - - if data_source == "uniprot": - from graphgen.models import UniProtSearch - - uniprot_search_client = UniProtSearch( - **search_config.get("uniprot_params", {}) - ) - - uniprot_results = await run_concurrent( - uniprot_search_client.search, - data, - desc="Searching UniProt database", - unit="keyword", - ) - results[data_source] = uniprot_results - - elif data_source == "ncbi": - from graphgen.models import NCBISearch - - ncbi_search_client = NCBISearch( - **search_config.get("ncbi_params", {}) - ) - - ncbi_results = await run_concurrent( - ncbi_search_client.search, - data, - desc="Searching NCBI database", - unit="keyword", - ) - results[data_source] = ncbi_results - - elif data_source == "rnacentral": - from graphgen.models import RNACentralSearch - - rnacentral_search_client = RNACentralSearch( - **search_config.get("rnacentral_params", {}) - ) - - rnacentral_results = await run_concurrent( - rnacentral_search_client.search, - data, - desc="Searching RNAcentral database", - unit="keyword", - ) - results[data_source] = rnacentral_results - - else: - logger.error("Data source %s not supported.", data_source) - continue - - return results diff --git a/graphgen/operators/search/search_service.py b/graphgen/operators/search/search_service.py new file mode 100644 index 00000000..60cbf42d --- /dev/null +++ b/graphgen/operators/search/search_service.py @@ -0,0 +1,428 @@ +""" +To use Google Web Search API, +follow the instructions [here](https://developers.google.com/custom-search/v1/overview) +to get your Google searcher api key. + +To use Bing Web Search API, +follow the instructions [here](https://www.microsoft.com/en-us/bing/apis/bing-web-search-api) +and obtain your Bing subscription key. 
+""" + +import pandas as pd +import numpy as np + +from graphgen.bases import BaseOperator +from graphgen.utils import create_event_loop, run_concurrent + + +class SearchService(BaseOperator): + """ + Service class for performing searches across multiple data sources. + Provides search functionality for UniProt, NCBI, and RNAcentral databases. + """ + + def __init__( + self, + working_dir: str = "cache", + data_sources: list = None, + ncbi_params: dict = None, + uniprot_params: dict = None, + rnacentral_params: dict = None, + save_interval: int = 1000, + **kwargs, + ): + super().__init__(working_dir=working_dir, op_name="search_service") + self.working_dir = working_dir + + # Build search_config dictionary from parameters + self.search_config = { + "data_sources": data_sources or [], + } + + if ncbi_params: + self.search_config["ncbi_params"] = ncbi_params + if uniprot_params: + self.search_config["uniprot_params"] = uniprot_params + if rnacentral_params: + self.search_config["rnacentral_params"] = rnacentral_params + + self.save_interval = save_interval + self.search_storage = None # Optional: can be initialized if needed for saving intermediate results + + async def _perform_searches(self, seed_data: dict) -> dict: + """ + Internal method to perform searches across multiple search types and aggregate the results. + :param seed_data: A dictionary containing seed data with entity names. + :return: A dictionary with search results + """ + results = {} + data_sources = self.search_config.get("data_sources", []) + + for data_source in data_sources: + data = list(seed_data.values()) + data = [d["content"] for d in data if "content" in d] + data = list(set(data)) # Remove duplicates + + # Prepare save callback for this data source + def make_save_callback(source_name): + def save_callback(intermediate_results, completed_count): + """Save intermediate search results.""" + if self.search_storage is None: + return + + # Convert results list to dict format + # Results are tuples of (query, result_dict) or just result_dict + batch_results = {} + for result in intermediate_results: + if result is None: + continue + # Check if result is a dict with _search_query key + if isinstance(result, dict) and "_search_query" in result: + query = result["_search_query"] + # Create a key for the result (using query as key) + key = f"{source_name}:{query}" + batch_results[key] = result + elif isinstance(result, dict): + # If no _search_query, use a generated key + key = f"{source_name}:{completed_count}" + batch_results[key] = result + + if batch_results: + # Filter out already existing keys + new_keys = self.search_storage.filter_keys(list(batch_results.keys())) + new_results = {k: v for k, v in batch_results.items() if k in new_keys} + if new_results: + self.search_storage.upsert(new_results) + self.search_storage.index_done_callback() + self.logger.debug("Saved %d intermediate results for %s", len(new_results), source_name) + + return save_callback + + if data_source == "uniprot": + from graphgen.models import UniProtSearch + + uniprot_params = self.search_config.get("uniprot_params", {}).copy() + # Get max_concurrent from config before passing params to constructor + max_concurrent = uniprot_params.pop("max_concurrent", None) + + uniprot_search_client = UniProtSearch( + working_dir=self.working_dir, + **uniprot_params + ) + + uniprot_results = await run_concurrent( + uniprot_search_client.search, + data, + desc="Searching UniProt database", + unit="keyword", + save_interval=self.save_interval if 
self.save_interval > 0 else 0, + save_callback=( + make_save_callback("uniprot") + if self.search_storage and self.save_interval > 0 + else None + ), + max_concurrent=max_concurrent, + ) + results[data_source] = uniprot_results + + elif data_source == "ncbi": + from graphgen.models import NCBISearch + + ncbi_params = self.search_config.get("ncbi_params", {}).copy() + # Get max_concurrent from config before passing params to constructor + max_concurrent = ncbi_params.pop("max_concurrent", None) + + ncbi_search_client = NCBISearch( + working_dir=self.working_dir, + **ncbi_params + ) + + ncbi_results = await run_concurrent( + ncbi_search_client.search, + data, + desc="Searching NCBI database", + unit="keyword", + save_interval=self.save_interval if self.save_interval > 0 else 0, + save_callback=( + make_save_callback("ncbi") + if self.search_storage and self.save_interval > 0 + else None + ), + max_concurrent=max_concurrent, + ) + results[data_source] = ncbi_results + + elif data_source == "rnacentral": + from graphgen.models import RNACentralSearch + + rnacentral_params = self.search_config.get("rnacentral_params", {}).copy() + # Get max_concurrent from config before passing params to constructor + max_concurrent = rnacentral_params.pop("max_concurrent", None) + + rnacentral_search_client = RNACentralSearch( + working_dir=self.working_dir, + **rnacentral_params + ) + + rnacentral_results = await run_concurrent( + rnacentral_search_client.search, + data, + desc="Searching RNAcentral database", + unit="keyword", + save_interval=self.save_interval if self.save_interval > 0 else 0, + save_callback=( + make_save_callback("rnacentral") + if self.search_storage and self.save_interval > 0 + else None + ), + max_concurrent=max_concurrent, + ) + results[data_source] = rnacentral_results + + else: + self.logger.error("Data source %s not supported.", data_source) + continue + + return results + + def _is_already_searched(self, doc: dict) -> bool: + """ + Check if a document already contains search results. + + :param doc: Document dictionary + :return: True if document appears to already contain search results + """ + # Check for data_source field (added by search_service) + if "data_source" in doc and doc["data_source"]: + return True + + # Check for database field (added by search operations) + if "database" in doc and doc["database"] in ["UniProt", "NCBI", "RNAcentral"]: + # Also check for molecule_type to confirm it's a search result + if "molecule_type" in doc and doc["molecule_type"] in ["DNA", "RNA", "protein"]: + return True + + # Check for search-specific fields that indicate search results + search_indicators = [ + "uniprot_id", "entry_name", # UniProt + "gene_id", "gene_name", "chromosome", # NCBI + "rnacentral_id", "rna_type", # RNAcentral + ] + if any(indicator in doc for indicator in search_indicators): + # Make sure it's not just metadata by checking for database or molecule_type + if "database" in doc or "molecule_type" in doc: + return True + + return False + + def _normalize_searched_data(self, doc: dict) -> dict: + """ + Normalize a document that already contains search results to the expected format. 
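+
+        A sketch of the mapping this performs (field names follow the checks
+        below; the accession value is illustrative):
+
+            {"id": "P01308", "database": "UniProt", "molecule_type": "protein",
+             "sequence": "MALW..."}
+            becomes
+            {"_doc_id": "doc-P01308", "type": "protein", "data_source": "uniprot",
+             "content": "Sequence: MALW...", **original fields}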
+ + :param doc: Document dictionary with search results + :return: Normalized document dictionary + """ + # Ensure required fields exist + doc_id = doc.get("_doc_id") + if not doc_id: + # Generate doc_id from id or other fields + raw_doc_id = doc.get("id") or doc.get("_search_query") or f"doc-{hash(str(doc))}" + doc_id = str(raw_doc_id) + + # Ensure doc_id starts with "doc-" prefix + if not doc_id.startswith("doc-"): + doc_id = f"doc-{doc_id}" + + # Determine document type from molecule_type or existing type + doc_type = doc.get("type", "text") + if doc_type == "text" and "molecule_type" in doc: + molecule_type = doc.get("molecule_type", "").lower() + if molecule_type in ["dna", "rna", "protein"]: + doc_type = molecule_type + + # Ensure data_source field exists + data_source = doc.get("data_source") + if not data_source: + # Infer from database field + database = doc.get("database", "").lower() + if "uniprot" in database: + data_source = "uniprot" + elif "ncbi" in database: + data_source = "ncbi" + elif "rnacentral" in database or "rna" in database: + data_source = "rnacentral" + + # Build or preserve content field + content = doc.get("content") + if not content or content.strip() == "": + # Build content from available fields if missing + content_parts = [] + if doc.get("title"): + content_parts.append(f"Title: {doc['title']}") + if doc.get("description"): + content_parts.append(f"Description: {doc['description']}") + if doc.get("function"): + func = doc["function"] + if isinstance(func, list): + func = ", ".join(str(f) for f in func) + content_parts.append(f"Function: {func}") + if doc.get("sequence"): + content_parts.append(f"Sequence: {doc['sequence']}") + + if not content_parts: + # Fallback: create content from key fields + key_fields = ["protein_name", "gene_name", "gene_description", "organism"] + for field in key_fields: + if field in doc and doc[field]: + content_parts.append(f"{field}: {doc[field]}") + + content = "\n".join(content_parts) if content_parts else str(doc) + + # Create normalized row + normalized_doc = { + "_doc_id": doc_id, + "type": doc_type, + "content": content, + "data_source": data_source, + **doc, # Include all original fields for metadata + } + + return normalized_doc + + def process(self, batch: pd.DataFrame) -> pd.DataFrame: + """ + Process a batch of documents and perform searches. + This is the Ray Data operator interface. + + If input data already contains search results (detected by presence of + data_source, database, or search-specific fields), the search step is + skipped and the data is normalized and returned directly. + + :param batch: DataFrame containing documents with at least '_doc_id' and 'content' columns + :return: DataFrame containing search results + """ + # Convert DataFrame to dictionary format + docs = batch.to_dict(orient="records") + + # Check if data already contains search results + already_searched = all(self._is_already_searched(doc) for doc in docs if doc) + + if already_searched: + # Data already contains search results, normalize and return directly + self.logger.info( + "Input data already contains search results. " + "Skipping search step and normalizing data." 
+ ) + result_rows = [] + for doc in docs: + if not doc: + continue + normalized_doc = self._normalize_searched_data(doc) + result_rows.append(normalized_doc) + + if not result_rows: + self.logger.warning("No documents found in batch") + return pd.DataFrame(columns=["_doc_id", "type", "content", "data_source"]) + + return pd.DataFrame(result_rows) + + # Data doesn't contain search results, perform search as usual + seed_data = {doc.get("_doc_id", f"doc-{i}"): doc for i, doc in enumerate(docs)} + + # Perform searches asynchronously + loop, created = create_event_loop() + try: + if loop.is_running(): + # If loop is already running, we can't use run_until_complete + # This shouldn't happen in normal usage, but handle it gracefully + raise RuntimeError( + "Cannot use process when event loop is already running. " + "This is likely a Ray worker configuration issue." + ) + search_results = loop.run_until_complete( + self._perform_searches(seed_data) + ) + finally: + # Only close the loop if we created it + if created: + loop.close() + + # Convert search_results from {data_source: [results]} to DataFrame + # Each result becomes a document row compatible with chunk service + result_rows = [] + + for data_source, result_list in search_results.items(): + if not isinstance(result_list, list): + continue + + for result in result_list: + if result is None: + continue + + # Convert search result to document format expected by chunk service + # Build content from available fields + content_parts = [] + if result.get("title"): + content_parts.append(f"Title: {result['title']}") + if result.get("description"): + content_parts.append(f"Description: {result['description']}") + if result.get("function"): + content_parts.append(f"Function: {result['function']}") + if result.get("sequence"): + content_parts.append(f"Sequence: {result['sequence']}") + + # If no content parts, use a default or combine all fields + if not content_parts: + # Fallback: create content from all string fields + content_parts = [ + f"{k}: {v}" + for k, v in result.items() + if isinstance(v, (str, int, float)) and k != "_search_query" + ] + + content = "\n".join(content_parts) if content_parts else str(result) + + # Determine document type from molecule_type or default to "text" + doc_type = result.get("molecule_type", "text").lower() + if doc_type not in ["text", "dna", "rna", "protein"]: + doc_type = "text" + + # Convert to string to handle Ray Data ListElement and other types + raw_doc_id = result.get("id") or result.get("_search_query") or f"search-{len(result_rows)}" + doc_id = str(raw_doc_id) + + # Ensure doc_id starts with "doc-" prefix + if not doc_id.startswith("doc-"): + doc_id = f"doc-{doc_id}" + + # Convert numpy arrays and complex types to Python-native types + # to avoid Ray Data tensor extension casting issues + def clean_value(v): + """Recursively convert numpy arrays and other problematic types to Python-native types.""" + if isinstance(v, np.ndarray): + return v.tolist() + elif isinstance(v, (list, tuple)): + return [clean_value(item) for item in v] + elif isinstance(v, dict): + return {k: clean_value(val) for k, val in v.items()} + else: + return v + + cleaned_result = {k: clean_value(v) for k, v in result.items()} + + # Create document row with all result fields plus required fields + row = { + "_doc_id": doc_id, + "type": doc_type, + "content": content, + "data_source": data_source, + **cleaned_result, # Include all original result fields for metadata + } + result_rows.append(row) + + if not result_rows: + 
self.logger.warning("No search results generated for this batch") + # Return empty DataFrame with expected structure + return pd.DataFrame(columns=["_doc_id", "type", "content", "data_source"]) + + return pd.DataFrame(result_rows) diff --git a/graphgen/run.py b/graphgen/run.py index e0efe7ea..a140a35f 100644 --- a/graphgen/run.py +++ b/graphgen/run.py @@ -1,7 +1,5 @@ import argparse -import logging import os -import sys import time from importlib import resources from typing import Any, Dict @@ -20,7 +18,74 @@ load_dotenv() - console_level=logging.ERROR, + +def set_working_dir(folder): + os.makedirs(folder, exist_ok=True) + + +def save_config(config_path, global_config): + if not os.path.exists(os.path.dirname(config_path)): + os.makedirs(os.path.dirname(config_path)) + with open(config_path, "w", encoding="utf-8") as config_file: + yaml.dump( + global_config, config_file, default_flow_style=False, allow_unicode=True + ) + + +class NodeFilenameProvider(FilenameProvider): + def __init__(self, node_id: str): + self.node_id = node_id + + def get_filename_for_block( + self, block: Block, write_uuid: str, task_index: int, block_index: int + ) -> str: + # format: {node_id}_{write_uuid}_{task_index:06}_{block_index:06}.json + return f"{self.node_id}_{write_uuid}_{task_index:06d}_{block_index:06d}.jsonl" + + def get_filename_for_row( + self, + row: Dict[str, Any], + write_uuid: str, + task_index: int, + block_index: int, + row_index: int, + ) -> str: + raise NotImplementedError( + f"Row-based filenames are not supported by write_json. " + f"Node: {self.node_id}, write_uuid: {write_uuid}" + ) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--config_file", + help="Config parameters for GraphGen.", + default=resources.files("graphgen") + .joinpath("configs") + .joinpath("aggregated_config.yaml"), + type=str, + ) + parser.add_argument( + "--output_dir", + help="Output directory for GraphGen results.", + default=None, + type=str, + ) + + args = parser.parse_args() + + with open(args.config_file, "r", encoding="utf-8") as f: + config = yaml.load(f, Loader=yaml.FullLoader) + + working_dir = args.output_dir or config.get("global_params", {}).get("working_dir", "cache") + unique_id = int(time.time()) + output_path = os.path.join(working_dir, "output", f"{unique_id}") + set_working_dir(output_path) + log_path = os.path.join(working_dir, "logs", "Driver.log") + driver_logger = set_logger( + log_path, + name="GraphGen", if_stream=True, ) CURRENT_LOGGER_VAR.set(driver_logger) @@ -30,53 +95,26 @@ log_path, ) - # Temporarily suppress non-error output (print statements, third-party libraries, Ray Data progress) - # Only redirect stdout to preserve stderr for logger error output - global _devnull - _devnull = open(os.devnull, 'w', encoding='utf-8') - sys.stdout = _devnull - - try: - engine = Engine(config, operators) - ds = ray.data.from_items([]) - results = engine.execute(ds) - - for node_id, dataset in results.items(): - node_output_path = os.path.join(output_path, f"{node_id}") - os.makedirs(node_output_path, exist_ok=True) - dataset.write_json( - node_output_path, - filename_provider=NodeFilenameProvider(node_id), - pandas_json_args_fn=lambda: { - "force_ascii": False, - "orient": "records", - "lines": True, - }, - ) - logger.info("Node %s results saved to %s", node_id, node_output_path) - - save_config(os.path.join(output_path, "config.yaml"), config) - logger.info("GraphGen completed successfully. 
Data saved to %s", output_path) - finally: - # Restore original stdout before printing results - sys.stdout = _original_stdout - if _devnull: - _devnull.close() - _devnull = None - - # Print save information to console - if 'results' in locals() and results: - print("\n" + "="*60) - print("GraphGen execution completed successfully!") - print("="*60) - for node_id, dataset in results.items(): - node_output_path = os.path.join(output_path, f"{node_id}") - print(f"✓ Node '{node_id}' results saved to: {node_output_path}") - print(f"✓ Config saved to: {os.path.join(output_path, 'config.yaml')}") - print(f"✓ Logs saved to: {log_path}") - print("="*60 + "\n") - else: - print("\n⚠️ Warning: No results were generated.\n") + engine = Engine(config, operators) + ds = ray.data.from_items([]) + results = engine.execute(ds) + + for node_id, dataset in results.items(): + output_path = os.path.join(output_path, f"{node_id}") + os.makedirs(output_path, exist_ok=True) + dataset.write_json( + output_path, + filename_provider=NodeFilenameProvider(node_id), + pandas_json_args_fn=lambda: { + "force_ascii": False, + "orient": "records", + "lines": True, + }, + ) + logger.info("Node %s results saved to %s", node_id, output_path) + + save_config(os.path.join(output_path, "config.yaml"), config) + logger.info("GraphGen completed successfully. Data saved to %s", output_path) if __name__ == "__main__": diff --git a/graphgen/templates/generation/__init__.py b/graphgen/templates/generation/__init__.py index 7e967b7b..22c3797a 100644 --- a/graphgen/templates/generation/__init__.py +++ b/graphgen/templates/generation/__init__.py @@ -2,5 +2,5 @@ from .atomic_generation import ATOMIC_GENERATION_PROMPT from .cot_generation import COT_GENERATION_PROMPT from .multi_hop_generation import MULTI_HOP_GENERATION_PROMPT - +from .omics_qa_generation import OMICS_QA_GENERATION_PROMPT from .vqa_generation import VQA_GENERATION_PROMPT diff --git a/graphgen/templates/generation/omics_qa_generation.py b/graphgen/templates/generation/omics_qa_generation.py new file mode 100644 index 00000000..1d356e62 --- /dev/null +++ b/graphgen/templates/generation/omics_qa_generation.py @@ -0,0 +1,99 @@ +# pylint: disable=C0301 +OMICS_QA_TEMPLATE_EN: str = """You are a senior computational biologist specializing in multi-omics data analysis (genomics, transcriptomics, proteomics). Your task is to generate logically coherent, verifiable and non-hallucinated question-answer pairs for the given biological sample described by the provided ENTITIES and RELATIONSHIPS. +Use English as the output language. + +---Objectives--- +Create multiple sets of omics-centric QA pairs that satisfy the following: +1. Only ask about objectively existing facts in the provided data (e.g., gene names, sequence information, functional annotations, regulatory elements, structural features, experimental metadata, etc.). Avoid subjective or speculative questions. +2. Ensure that each question has a single, clear and verifiable answer that can be directly confirmed from the given entities/relationships. +3. Questions should cover diverse aspects: sequence, structure, function, interactions, regulation, experimental annotations, etc. +4. Avoid repetitive questions; each question must be unique and meaningful. +5. Use concise, unambiguous language; do not invent information beyond the provided data. + +---Instructions--- +1. 
Carefully analyse the supplied ENTITIES and RELATIONSHIPS to identify: + - Biological entities (genes, proteins, RNA molecules, regulatory elements, pathways, etc.) + - Sequence information (DNA sequences, RNA sequences, protein sequences) + - Functional annotations (gene function, protein function, RNA function, biological processes) + - Structural features (chromosomal location, genomic coordinates, domain structures, etc.) + - Regulatory relationships (transcription, translation, regulation, interaction) + - Experimental metadata (database IDs, organism, experimental methods, etc.) +2. Organise information logically: + - Start with sequence/primary structure information + - Proceed to functional annotations and biological roles + - Include regulatory relationships and interactions + - End with experimental context and metadata +3. Maintain scientific accuracy and consistent nomenclature (standard gene names, sequence identifiers, etc.). +4. Review each QA pair to guarantee logical consistency and absence of hallucination. + +################ +-ENTITIES- +################ +{entities} + +################ +-RELATIONSHIPS- +################ +{relationships} +################ +Directly output the generated QA pairs below. Do NOT copy any example questions, and do NOT include extraneous text. + +Question: +Answer: + +Question: +Answer: + +""" + +OMICS_QA_TEMPLATE_ZH: str = """你是一位资深的多组学数据计算生物学家(基因组学、转录组学、蛋白质组学)。你的任务是根据下述提供的实体与关系,为给定的生物样本生成逻辑连贯、可验证、无幻觉的中英双语问答对(这里仅输出中文)。 +使用中文作为输出语言。 + +---目标--- +创建多组以组学数据为中心的问答对,满足: +1. 仅询问数据中客观存在的事实(如基因名称、序列信息、功能注释、调控元件、结构特征、实验元数据等),避免主观或推测性问题。 +2. 每个问题必须有单一、明确且可直接验证的答案,答案必须能从给定实体/关系中直接确认。 +3. 问题需覆盖:序列、结构、功能、相互作用、调控、实验注释等多个维度,确保多样性与全面性。 +4. 避免重复提问,每个问题都独特且有意义。 +5. 语言简洁、无歧义,严禁编造超出给定数据的信息。 + +---说明--- +1. 仔细分析提供的实体与关系,识别: + - 生物实体(基因、蛋白质、RNA分子、调控元件、通路等) + - 序列信息(DNA序列、RNA序列、蛋白质序列) + - 功能注释(基因功能、蛋白质功能、RNA功能、生物学过程) + - 结构特征(染色体位置、基因组坐标、结构域等) + - 调控关系(转录、翻译、调控、相互作用) + - 实验元数据(数据库ID、生物体、实验方法等) +2. 按逻辑顺序组织信息: + - 从序列/一级结构信息入手 + - 再到功能注释和生物学作用 + - 包括调控关系和相互作用 + - 最后到实验背景和元数据 +3. 保持科学准确性,使用统一命名规范(标准基因名、序列标识符等)。 +4. 检查每对问答,确保逻辑一致且无幻觉。 + +################ +-实体- +################ +{entities} + +################ +-关系- +################ +{relationships} +################ +请直接在下方输出生成的问答对,不要复制任何示例,不要输出无关内容。 + +问题: <问题1> +答案: <答案1> + +问题: <问题2> +答案: <答案2> + +""" + +OMICS_QA_GENERATION_PROMPT = { + "en": OMICS_QA_TEMPLATE_EN, + "zh": OMICS_QA_TEMPLATE_ZH, +} diff --git a/graphgen/templates/kg/__init__.py b/graphgen/templates/kg/__init__.py index 3f8b6ed2..efc2ca31 100644 --- a/graphgen/templates/kg/__init__.py +++ b/graphgen/templates/kg/__init__.py @@ -1,4 +1,5 @@ from .kg_extraction import KG_EXTRACTION_PROMPT from .kg_summarization import KG_SUMMARIZATION_PROMPT from .mm_kg_extraction import MMKG_EXTRACTION_PROMPT - +from .omics_kg_extraction import OMICS_KG_EXTRACTION_PROMPT +from .protein_kg_extraction import PROTEIN_KG_EXTRACTION_PROMPT \ No newline at end of file diff --git a/graphgen/templates/kg/omics_kg_extraction.py b/graphgen/templates/kg/omics_kg_extraction.py new file mode 100644 index 00000000..d105dd38 --- /dev/null +++ b/graphgen/templates/kg/omics_kg_extraction.py @@ -0,0 +1,209 @@ +# pylint: disable=C0301 +TEMPLATE_EN: str = """You are a bioinformatics expert, skilled at analyzing biological sequences (DNA, RNA, protein) and their metadata to extract biological entities and their relationships. + +-Goal- +Given a biological sequence chunk (DNA, RNA, or protein) along with its metadata, identify all relevant biological entities and their relationships. 
+Use English as output language. + +-Steps- +1. Identify all biological entities. For each identified entity, extract the following information: +- entity_name: Name of the entity (gene name, protein name, RNA name, domain name, etc.), capitalized +- entity_type: One of the following types: [{entity_types}] +- entity_summary: Comprehensive summary of the entity's biological function, structure, or properties +Format each entity as ("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *biologically related* to each other. +For each pair of related entities, extract the following information: +- source_entity: name of the source entity, as identified in step 1 +- target_entity: name of the target entity, as identified in step 1 +- relationship_summary: explanation of the biological relationship (e.g., encodes, transcribes, translates, interacts, regulates, homologous_to, located_in, etc.) +Format each relationship as ("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +3. Identify high-level key words that summarize the main biological concepts, functions, or themes. +Format the content-level key words as ("content_keywords"{tuple_delimiter}) + +4. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter. + +5. When finished, output {completion_delimiter} + +################ +-Examples- +################ +-Example 1- +Sequence Metadata: +################ +molecule_type: DNA +database: NCBI +id: NG_033923 +gene_name: BRCA1 +gene_description: BRCA1 DNA repair associated +organism: Homo sapiens +gene_type: protein-coding +chromosome: 17 +genomic_location: 43044295-43125483 +function: BRCA1 is a tumor suppressor gene involved in DNA repair +sequence_chunk: ATGCGATCGATCGATCG... 
(first 500bp of BRCA1 gene) +################ +Output: +("entity"{tuple_delimiter}"BRCA1"{tuple_delimiter}"gene"{tuple_delimiter}"BRCA1 is a protein-coding tumor suppressor gene located on chromosome 17 in humans, involved in DNA repair mechanisms."){record_delimiter} +("entity"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"organism"{tuple_delimiter}"Human species, the organism in which BRCA1 gene is found."){record_delimiter} +("entity"{tuple_delimiter}"chromosome 17"{tuple_delimiter}"location"{tuple_delimiter}"Chromosome 17 is the chromosomal location of the BRCA1 gene in humans."){record_delimiter} +("entity"{tuple_delimiter}"DNA repair"{tuple_delimiter}"biological_process"{tuple_delimiter}"DNA repair is a biological process in which BRCA1 is involved as a tumor suppressor."){record_delimiter} +("relationship"{tuple_delimiter}"BRCA1"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"BRCA1 is a gene found in Homo sapiens."){record_delimiter} +("relationship"{tuple_delimiter}"BRCA1"{tuple_delimiter}"chromosome 17"{tuple_delimiter}"BRCA1 is located on chromosome 17 in the human genome."){record_delimiter} +("relationship"{tuple_delimiter}"BRCA1"{tuple_delimiter}"DNA repair"{tuple_delimiter}"BRCA1 is involved in DNA repair processes as a tumor suppressor gene."){record_delimiter} +("content_keywords"{tuple_delimiter}"tumor suppressor, DNA repair, genetic disease, cancer genetics"){completion_delimiter} + +-Example 2- +Sequence Metadata: +################ +molecule_type: RNA +database: RNAcentral +id: URS0000000001 +rna_type: miRNA +description: hsa-let-7a-1 microRNA +organism: Homo sapiens +related_genes: ["LIN28", "HMGA2"] +sequence_chunk: CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG +################ +Output: +("entity"{tuple_delimiter}"hsa-let-7a-1"{tuple_delimiter}"rna"{tuple_delimiter}"hsa-let-7a-1 is a microRNA (miRNA) found in Homo sapiens, involved in gene regulation."){record_delimiter} +("entity"{tuple_delimiter}"LIN28"{tuple_delimiter}"gene"{tuple_delimiter}"LIN28 is a gene related to hsa-let-7a-1 microRNA, involved in RNA processing and development."){record_delimiter} +("entity"{tuple_delimiter}"HMGA2"{tuple_delimiter}"gene"{tuple_delimiter}"HMGA2 is a gene related to hsa-let-7a-1 microRNA, involved in chromatin structure and gene expression."){record_delimiter} +("entity"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"organism"{tuple_delimiter}"Human species, the organism in which hsa-let-7a-1 is found."){record_delimiter} +("entity"{tuple_delimiter}"microRNA"{tuple_delimiter}"rna_type"{tuple_delimiter}"MicroRNA is a type of small non-coding RNA involved in post-transcriptional gene regulation."){record_delimiter} +("relationship"{tuple_delimiter}"hsa-let-7a-1"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"hsa-let-7a-1 is a microRNA found in Homo sapiens."){record_delimiter} +("relationship"{tuple_delimiter}"hsa-let-7a-1"{tuple_delimiter}"LIN28"{tuple_delimiter}"hsa-let-7a-1 microRNA is related to LIN28 gene, potentially regulating its expression."){record_delimiter} +("relationship"{tuple_delimiter}"hsa-let-7a-1"{tuple_delimiter}"HMGA2"{tuple_delimiter}"hsa-let-7a-1 microRNA is related to HMGA2 gene, potentially regulating its expression."){record_delimiter} 
+("relationship"{tuple_delimiter}"hsa-let-7a-1"{tuple_delimiter}"microRNA"{tuple_delimiter}"hsa-let-7a-1 belongs to the microRNA class of RNA molecules."){record_delimiter} +("content_keywords"{tuple_delimiter}"microRNA, gene regulation, post-transcriptional control, RNA processing"){completion_delimiter} + +-Example 3- +Sequence Metadata: +################ +molecule_type: protein +database: UniProt +id: P01308 +protein_name: Insulin +organism: Homo sapiens +function: ["Regulates glucose metabolism", "Hormone signaling"] +sequence_chunk: MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN +################ +Output: +("entity"{tuple_delimiter}"Insulin"{tuple_delimiter}"protein"{tuple_delimiter}"Insulin is a protein hormone in Homo sapiens that regulates glucose metabolism and hormone signaling."){record_delimiter} +("entity"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"organism"{tuple_delimiter}"Human species, the organism in which Insulin is produced."){record_delimiter} +("entity"{tuple_delimiter}"glucose metabolism"{tuple_delimiter}"biological_process"{tuple_delimiter}"Glucose metabolism is a biological process regulated by Insulin."){record_delimiter} +("entity"{tuple_delimiter}"hormone signaling"{tuple_delimiter}"biological_process"{tuple_delimiter}"Hormone signaling is a biological process in which Insulin participates as a signaling molecule."){record_delimiter} +("relationship"{tuple_delimiter}"Insulin"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"Insulin is a protein produced in Homo sapiens."){record_delimiter} +("relationship"{tuple_delimiter}"Insulin"{tuple_delimiter}"glucose metabolism"{tuple_delimiter}"Insulin regulates glucose metabolism in the body."){record_delimiter} +("relationship"{tuple_delimiter}"Insulin"{tuple_delimiter}"hormone signaling"{tuple_delimiter}"Insulin participates in hormone signaling pathways."){record_delimiter} +("content_keywords"{tuple_delimiter}"hormone, metabolism, glucose regulation, signaling pathway"){completion_delimiter} + +################ +-Real Data- +################ +Entity_types: {entity_types} +Sequence Metadata: {metadata_text} +Sequence Chunk: {sequence_chunk} +################ +Output: +""" + + +TEMPLATE_ZH: str = """你是一个生物信息学专家,擅长分析生物序列(DNA、RNA、蛋白质)及其元数据,提取生物实体及其关系。 + +-目标- +给定一个生物序列片段(DNA、RNA或蛋白质)及其元数据,识别所有相关的生物实体及其关系。 +使用中文作为输出语言。 + +-步骤- +1. 识别所有生物实体。对于每个识别的实体,提取以下信息: + - entity_name:实体的名称(基因名、蛋白质名、RNA名、功能域名等),首字母大写 + - entity_type:以下类型之一:[{entity_types}] + - entity_summary:实体生物学功能、结构或属性的全面总结 + 将每个实体格式化为("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +2. 从步骤1中识别的实体中,识别所有(源实体,目标实体)对,这些实体彼此之间*在生物学上相关*。 + 对于每对相关的实体,提取以下信息: + - source_entity:步骤1中识别的源实体名称 + - target_entity:步骤1中识别的目标实体名称 + - relationship_summary:生物学关系的解释(例如:编码、转录、翻译、相互作用、调控、同源、位于等) + 将每个关系格式化为("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +3. 识别总结主要生物学概念、功能或主题的高级关键词。 + 将内容级关键词格式化为("content_keywords"{tuple_delimiter}) + +4. 以中文返回步骤1和2中识别出的所有实体和关系的输出列表。使用**{record_delimiter}**作为列表分隔符。 + +5. 完成后,输出{completion_delimiter} + +################ +-示例- +################ +-示例 1- +序列元数据: +################ +molecule_type: DNA +database: NCBI +id: NG_033923 +gene_name: BRCA1 +gene_description: BRCA1 DNA repair associated +organism: Homo sapiens +gene_type: protein-coding +chromosome: 17 +genomic_location: 43044295-43125483 +function: BRCA1 is a tumor suppressor gene involved in DNA repair +sequence_chunk: ATGCGATCGATCGATCG... 
(BRCA1基因的前500bp) +################ +输出: +("entity"{tuple_delimiter}"BRCA1"{tuple_delimiter}"gene"{tuple_delimiter}"BRCA1是位于人类17号染色体上的蛋白质编码肿瘤抑制基因,参与DNA修复机制。"){record_delimiter} +("entity"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"organism"{tuple_delimiter}"人类,BRCA1基因所在的生物体。"){record_delimiter} +("entity"{tuple_delimiter}"17号染色体"{tuple_delimiter}"location"{tuple_delimiter}"17号染色体是BRCA1基因在人类基因组中的位置。"){record_delimiter} +("entity"{tuple_delimiter}"DNA修复"{tuple_delimiter}"biological_process"{tuple_delimiter}"DNA修复是BRCA1作为肿瘤抑制基因参与的生物学过程。"){record_delimiter} +("relationship"{tuple_delimiter}"BRCA1"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"BRCA1是在人类中发现的基因。"){record_delimiter} +("relationship"{tuple_delimiter}"BRCA1"{tuple_delimiter}"17号染色体"{tuple_delimiter}"BRCA1位于人类基因组的17号染色体上。"){record_delimiter} +("relationship"{tuple_delimiter}"BRCA1"{tuple_delimiter}"DNA修复"{tuple_delimiter}"BRCA1作为肿瘤抑制基因参与DNA修复过程。"){record_delimiter} +("content_keywords"{tuple_delimiter}"肿瘤抑制, DNA修复, 遗传疾病, 癌症遗传学"){completion_delimiter} + +################ +-真实数据- +################ +实体类型:{entity_types} +序列元数据:{metadata_text} +序列片段:{sequence_chunk} +################ +输出: +""" + + +CONTINUE_EN: str = """MANY entities and relationships were missed in the last extraction. \ +Add them below using the same format: +""" + +CONTINUE_ZH: str = """很多实体和关系在上一次的提取中可能被遗漏了。请在下面使用相同的格式添加它们:""" + +IF_LOOP_EN: str = """It appears some entities and relationships may have still been missed. \ +Answer YES | NO if there are still entities and relationships that need to be added. +""" + +IF_LOOP_ZH: str = """看起来可能仍然遗漏了一些实体和关系。如果仍有实体和关系需要添加,请回答YES | NO。""" + +OMICS_KG_EXTRACTION_PROMPT: dict = { + "en": { + "TEMPLATE": TEMPLATE_EN, + "CONTINUE": CONTINUE_EN, + "IF_LOOP": IF_LOOP_EN, + }, + "zh": { + "TEMPLATE": TEMPLATE_ZH, + "CONTINUE": CONTINUE_ZH, + "IF_LOOP": IF_LOOP_ZH, + }, + "FORMAT": { + "tuple_delimiter": "<|>", + "record_delimiter": "##", + "completion_delimiter": "<|COMPLETE|>", + "entity_types": "gene, rna, protein, organism, location, biological_process, rna_type, protein_domain, \ +mutation, pathway, disease, function, structure", + }, +} diff --git a/graphgen/templates/kg/protein_kg_extraction.py b/graphgen/templates/kg/protein_kg_extraction.py new file mode 100644 index 00000000..16e9c868 --- /dev/null +++ b/graphgen/templates/kg/protein_kg_extraction.py @@ -0,0 +1,144 @@ +# pylint: disable=C0301 +TEMPLATE_EN: str = """You are an expert in protein science and knowledge-graph construction. +Your task is to extract a star-shaped knowledge graph centered on **a single protein** mentioned in the given text. + +-Goal- +Given free-text that discusses one or more proteins, identify: +1. The **central protein** (the first-mentioned protein or the protein explicitly indicated by the user). +2. All entities that are **directly related** to this central protein. +3. All relationships that **directly link** those entities to the central protein (star edges). + +Use English as the output language. Please provide only the result without any extra explanations. + +-Steps- +1. Identify the **central protein entity** and all **directly-related entities** from the text. + For the **central protein**, extract: + - entity_name: use the full name or UniProt ID if given; capitalized. + - entity_type: always `protein`. + - entity_summary: concise description of its main biological role, location, or significance in the text. + + For each **directly-related entity**, extract: + - entity_name: capitalized. 
+ - entity_type: one of [{entity_types}]. + - entity_summary: comprehensive summary of its attributes/activities **as stated in the text**. + + Format each entity as + ("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +2. From the entities found in Step 1, list every **(central protein → related entity)** pair that is **clearly related**. + For each pair extract: + - source_entity: the **central protein** name. + - target_entity: the related entity name. + - relationship_summary: short explanation of how the central protein is connected to this entity **according to the text**. + + Format each relationship as + ("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +3. Output a single list of all entities and relationships from Steps 1–2, using **{record_delimiter}** as the delimiter. + +4. Finish by printing {completion_delimiter} + +################ +-Example- +################ +Text: +################ +The tumor-suppressor protein p53 is a transcription factor that responds to DNA damage. +Phosphorylation of p53 by ATM kinase at serine-15 enhances its stability. +MDM2, an E3 ubiquitin ligase, negatively regulates p53 via ubiquitination. +################ +Output: +("entity"{tuple_delimiter}"p53"{tuple_delimiter}"protein"{tuple_delimiter}"Tumor-suppressor transcription factor that responds to DNA damage and is regulated by post-translational modifications."){record_delimiter} +("entity"{tuple_delimiter}"ATM kinase"{tuple_delimiter}"protein"{tuple_delimiter}"Protein kinase that phosphorylates p53 at serine-15, thereby enhancing p53 stability."){record_delimiter} +("entity"{tuple_delimiter}"serine-15"{tuple_delimiter}"site"{tuple_delimiter}"Phosphorylation site on p53 that is targeted by ATM kinase."){record_delimiter} +("entity"{tuple_delimiter}"MDM2"{tuple_delimiter}"protein"{tuple_delimiter}"E3 ubiquitin ligase that negatively regulates p53 through ubiquitination."){record_delimiter} +("entity"{tuple_delimiter}"DNA damage"{tuple_delimiter}"concept"{tuple_delimiter}"Cellular stress signal that activates p53-mediated transcriptional response."){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"ATM kinase"{tuple_delimiter}"ATM kinase phosphorylates p53, enhancing its stability."){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"serine-15"{tuple_delimiter}"p53 is phosphorylated at serine-15 by ATM kinase."){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"MDM2"{tuple_delimiter}"MDM2 ubiquitinates p53, negatively regulating its activity."){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"DNA damage"{tuple_delimiter}"p53 acts as a sensor-transcription factor in response to DNA damage."){completion_delimiter} + +################ +-Real Data- +Entity_types: {entity_types} +Text: {input_text} +################ +Output: +""" + + +TEMPLATE_ZH: str = """您是蛋白质科学与知识图谱构建专家。 +任务:从给定文本中抽取以**一个中心蛋白质**为核心的星型知识图谱。 + +-目标- +文本可能提及一个或多个蛋白质,请: +1. 确定**中心蛋白质**(文本首个提及或用户指定的蛋白)。 +2. 识别所有与中心蛋白**直接相关**的实体。 +3. 仅保留**中心蛋白→相关实体**的直接关系(星型边)。 + +使用中文输出。请直接给出结果,不要增加额外说明。 + +-步骤- +1. 确定**中心蛋白质实体**及所有**直接相关实体**。 + 对于**中心蛋白质**: + - entity_name:全名或UniProt ID,首字母大写。 + - entity_type:固定为`protein`。 + - entity_summary:简述其在文中的生物学功能、定位或意义。 + + 对于每个**直接相关实体**: + - entity_name:首字母大写。 + - entity_type:可选类型[{entity_types}]。 + - entity_summary:全面总结其在文中与中心蛋白相关的属性/活动。 + + 格式:("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +2. 
在步骤1的实体中,列出所有**(中心蛋白→相关实体)**的明显关系对。 + 每对提取: + - source_entity:中心蛋白名称。 + - target_entity:相关实体名称。 + - relationship_summary:简要说明文中二者如何直接关联。 + + 格式:("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +3. 将步骤1–2的所有实体与关系合并为单列表,用**{record_delimiter}**分隔。 + +4. 输出结束标记{completion_delimiter} + +################ +-示例- +################ +文本: +################ +肿瘤抑制蛋白p53是一种转录因子,可响应DNA损伤。ATM激酶在第15位丝氨酸磷酸化p53,增强其稳定性。E3泛素连接酶MDM2通过泛素化负调控p53。 +################ +输出: +("entity"{tuple_delimiter}"p53"{tuple_delimiter}"protein"{tuple_delimiter}"肿瘤抑制转录因子,能感知DNA损伤并通过翻译后修饰被调控。"){record_delimiter} +("entity"{tuple_delimiter}"ATM激酶"{tuple_delimiter}"protein"{tuple_delimiter}"蛋白激酶,在丝氨酸-15位点磷酸化p53,从而提高其稳定性。"){record_delimiter} +("entity"{tuple_delimiter}"丝氨酸-15"{tuple_delimiter}"site"{tuple_delimiter}"p53上被ATM激酶靶向的磷酸化位点。"){record_delimiter} +("entity"{tuple_delimiter}"MDM2"{tuple_delimiter}"protein"{tuple_delimiter}"E3泛素连接酶,通过泛素化负调控p53。"){record_delimiter} +("entity"{tuple_delimiter}"DNA损伤"{tuple_delimiter}"concept"{tuple_delimiter}"细胞内应激信号,可激活p53介导的转录应答。"){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"ATM激酶"{tuple_delimiter}"ATM激酶磷酸化p53,增强其稳定性。"){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"丝氨酸-15"{tuple_delimiter}"p53在该位点被ATM激酶磷酸化。"){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"MDM2"{tuple_delimiter}"MDM2对p53进行泛素化,负向调控其活性。"){record_delimiter} +("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"DNA损伤"{tuple_delimiter}"p53作为感受器-转录因子响应DNA损伤。"){completion_delimiter} + +################ +-真实数据- +实体类型:{entity_types} +文本:{input_text} +################ +输出: +""" + + +PROTEIN_KG_EXTRACTION_PROMPT: dict = { + "en": TEMPLATE_EN, + "zh": TEMPLATE_ZH, + "FORMAT": { + "tuple_delimiter": "<|>", + "record_delimiter": "##", + "completion_delimiter": "<|COMPLETE|>", + "entity_types": "protein, gene, site, modification, pathway, disease, drug, organism, tissue, cell_line, " + "experiment, technology, concept, location, organization, person, mission, science", + }, +} diff --git a/graphgen/utils/loop.py b/graphgen/utils/loop.py index 5f12fa5b..f0ab7dfd 100644 --- a/graphgen/utils/loop.py +++ b/graphgen/utils/loop.py @@ -1,9 +1,10 @@ import asyncio +from typing import Tuple from .log import logger -def create_event_loop() -> asyncio.AbstractEventLoop: +def create_event_loop() -> Tuple[asyncio.AbstractEventLoop, bool]: """ Ensure that there is always an event loop available. @@ -11,18 +12,25 @@ def create_event_loop() -> asyncio.AbstractEventLoop: it creates a new event loop and sets it as the current event loop. Returns: - asyncio.AbstractEventLoop: The current or newly created event loop. + Tuple[asyncio.AbstractEventLoop, bool]: The event loop and a flag + indicating if we created it (True) or it was already running (False). 
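+
+    A typical call site (illustrative; shows how the returned flag decides
+    whether the caller should close the loop):
+
+        loop, created = create_event_loop()
+        try:
+            loop.run_until_complete(coro())  # `coro()` stands for any coroutine
+        finally:
+            if created:
+                loop.close()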
""" try: - # Try to get the current event loop - current_loop = asyncio.get_event_loop() - if current_loop.is_closed(): - raise RuntimeError("Event loop is closed.") - return current_loop - + # Try to get the running event loop (Python 3.7+) + running_loop = asyncio.get_running_loop() + # If we get here, there's already a running loop + return running_loop, False except RuntimeError: - # If no event loop exists or it is closed, create a new one - logger.info("Creating a new event loop in main thread.") - new_loop = asyncio.new_event_loop() - asyncio.set_event_loop(new_loop) - return new_loop + # No running loop, try to get the current event loop + try: + current_loop = asyncio.get_event_loop() + if current_loop.is_closed(): + raise RuntimeError("Event loop is closed.") from None + # Loop exists but not running, we can use it + return current_loop, False + except RuntimeError: + # No event loop exists, create a new one + logger.info("Creating a new event loop in main thread.") + new_loop = asyncio.new_event_loop() + asyncio.set_event_loop(new_loop) + return new_loop, True From b6e65c07c92216709cab7cd56758599070d921a1 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Thu, 18 Dec 2025 18:51:52 +0800 Subject: [PATCH 04/20] fix: update local blast database paths in omics qa config --- examples/generate/generate_omics_qa/omics_qa_config.yaml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/generate/generate_omics_qa/omics_qa_config.yaml b/examples/generate/generate_omics_qa/omics_qa_config.yaml index 22b9b26c..8f7966ad 100644 --- a/examples/generate/generate_omics_qa/omics_qa_config.yaml +++ b/examples/generate/generate_omics_qa/omics_qa_config.yaml @@ -30,20 +30,19 @@ nodes: email: your_email@example.com # Required for NCBI tool: GraphGen use_local_blast: true - local_blast_db: databases/refseq_232_old/refseq_232 + local_blast_db: path_to_your_local_blast_db/refseq_version/refseq_version blast_num_threads: 2 max_concurrent: 5 # RNA search parameters rnacentral_params: use_local_blast: true - local_blast_db: databases/rnacentral_merged_20251213/rnacentral_merged_20251213 + local_blast_db: path_to_your_local_blast_db/rnacentral_YYYYMMDD/rnacentral_YYYYMMDD blast_num_threads: 2 max_concurrent: 5 # Protein search parameters uniprot_params: use_local_blast: true - # local_blast_db: ${RELEASE}/uniprot_sprot - local_blast_db: databases/2025_04/uniprot_sprot + local_blast_db: path_to_your_local_blast_db/${RELEASE}/uniprot_sprot blast_num_threads: 2 max_concurrent: 5 From 29ab42fb83f5aa94e6b13cea25f7d71989f94d95 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Thu, 18 Dec 2025 19:03:26 +0800 Subject: [PATCH 05/20] fix: fix pylint problems --- .../models/generator/omics_qa_generator.py | 4 +-- graphgen/models/kg_builder/__init__.py | 2 +- .../partitioner/anchor_bfs_partitioner.py | 2 +- .../models/searcher/db/rnacentral_searcher.py | 2 +- .../models/searcher/db/uniprot_searcher.py | 2 +- .../operators/partition/partition_service.py | 2 +- graphgen/operators/search/search_service.py | 28 +++++++++---------- graphgen/templates/kg/__init__.py | 2 +- 8 files changed, 22 insertions(+), 22 deletions(-) diff --git a/graphgen/models/generator/omics_qa_generator.py b/graphgen/models/generator/omics_qa_generator.py index d5ac7ddb..76bd8c7b 100644 --- a/graphgen/models/generator/omics_qa_generator.py +++ b/graphgen/models/generator/omics_qa_generator.py @@ -68,7 +68,7 @@ def parse_response(response: str) -> Any: return qa_pairs @staticmethod - 
def _extract_caption(node_data: dict, molecule_type: str) -> Optional[dict]: + def _extract_caption(node_data: dict, molecule_type: str) -> Optional[dict]: # pylint: disable=too-many-branches """ Extract molecule-specific caption information from node data. @@ -341,7 +341,7 @@ def format_generation_results( } for qa in qa_items ] - if output_data_format == "ChatML": + elif output_data_format == "ChatML": return [ { "messages": [ diff --git a/graphgen/models/kg_builder/__init__.py b/graphgen/models/kg_builder/__init__.py index 3dc4ae95..aa0339c6 100644 --- a/graphgen/models/kg_builder/__init__.py +++ b/graphgen/models/kg_builder/__init__.py @@ -1,3 +1,3 @@ from .light_rag_kg_builder import LightRAGKGBuilder from .mm_kg_builder import MMKGBuilder -from .omics_kg_builder import OmicsKGBuilder \ No newline at end of file +from .omics_kg_builder import OmicsKGBuilder diff --git a/graphgen/models/partitioner/anchor_bfs_partitioner.py b/graphgen/models/partitioner/anchor_bfs_partitioner.py index 1e5e1903..f94fe6df 100644 --- a/graphgen/models/partitioner/anchor_bfs_partitioner.py +++ b/graphgen/models/partitioner/anchor_bfs_partitioner.py @@ -66,7 +66,7 @@ def partition( if comm_n or comm_e: yield Community(id=seed_node, nodes=comm_n, edges=comm_e) - def _pick_anchor_ids( + def _pick_anchor_ids( # pylint: disable=too-many-branches self, nodes: List[tuple[str, dict]], ) -> Set[str]: diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py index d0a27efe..2d5ef138 100644 --- a/graphgen/models/searcher/db/rnacentral_searcher.py +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -276,7 +276,7 @@ def _local_blast(self, seq: str, threshold: float) -> Optional[str]: pass return None - def get_by_fasta(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: + def get_by_fasta(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: # pylint: disable=too-many-return-statements """ Search RNAcentral with an RNA sequence. Tries local BLAST first if enabled, falls back to RNAcentral API. diff --git a/graphgen/models/searcher/db/uniprot_searcher.py b/graphgen/models/searcher/db/uniprot_searcher.py index ca32ff0e..22181d05 100644 --- a/graphgen/models/searcher/db/uniprot_searcher.py +++ b/graphgen/models/searcher/db/uniprot_searcher.py @@ -111,7 +111,7 @@ def get_best_hit(self, keyword: str) -> Optional[Dict]: self.logger.error("Keyword %s not found: %s", keyword, e) return None - def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]: + def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]: # pylint: disable=too-many-return-statements """ Search UniProt with a FASTA sequence and return the best hit. :param fasta_sequence: The FASTA sequence. diff --git a/graphgen/operators/partition/partition_service.py b/graphgen/operators/partition/partition_service.py index dbb52b92..603577fc 100644 --- a/graphgen/operators/partition/partition_service.py +++ b/graphgen/operators/partition/partition_service.py @@ -127,7 +127,7 @@ def _pre_tokenize(self) -> None: self.kg_instance.index_done_callback() logger.info("Pre-tokenization completed.") - def _attach_additional_data_to_node(self, batch: tuple) -> tuple: + def _attach_additional_data_to_node(self, batch: tuple) -> tuple: # pylint: disable=too-many-branches,too-many-statements """ Attach additional data from chunk_storage to nodes in the batch. 
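+
+        A sketch of the enrichment (hedged example; field names follow the
+        mapping defined in the body, values are placeholders): a node whose
+        chunks include a DNA record gains
+
+            node_data["dna_caption"] = {"function": ..., "sequence": ..., "id": ..., "database": ...}
+            node_data["sequence"] = ...  # copied from the primary molecule type's chunk
+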
:param batch: tuple of (nodes_data, edges_data) diff --git a/graphgen/operators/search/search_service.py b/graphgen/operators/search/search_service.py index 60cbf42d..7cc53dad 100644 --- a/graphgen/operators/search/search_service.py +++ b/graphgen/operators/search/search_service.py @@ -215,7 +215,18 @@ def _is_already_searched(self, doc: dict) -> bool: return False - def _normalize_searched_data(self, doc: dict) -> dict: + @staticmethod + def _clean_value(v): + """Recursively convert numpy arrays and other problematic types to Python-native types.""" + if isinstance(v, np.ndarray): + return v.tolist() + if isinstance(v, (list, tuple)): + return [SearchService._clean_value(item) for item in v] + if isinstance(v, dict): + return {k: SearchService._clean_value(val) for k, val in v.items()} + return v + + def _normalize_searched_data(self, doc: dict) -> dict: # pylint: disable=too-many-branches """ Normalize a document that already contains search results to the expected format. @@ -289,7 +300,7 @@ def _normalize_searched_data(self, doc: dict) -> dict: return normalized_doc - def process(self, batch: pd.DataFrame) -> pd.DataFrame: + def process(self, batch: pd.DataFrame) -> pd.DataFrame: # pylint: disable=too-many-branches """ Process a batch of documents and perform searches. This is the Ray Data operator interface. @@ -397,18 +408,7 @@ def process(self, batch: pd.DataFrame) -> pd.DataFrame: # Convert numpy arrays and complex types to Python-native types # to avoid Ray Data tensor extension casting issues - def clean_value(v): - """Recursively convert numpy arrays and other problematic types to Python-native types.""" - if isinstance(v, np.ndarray): - return v.tolist() - elif isinstance(v, (list, tuple)): - return [clean_value(item) for item in v] - elif isinstance(v, dict): - return {k: clean_value(val) for k, val in v.items()} - else: - return v - - cleaned_result = {k: clean_value(v) for k, v in result.items()} + cleaned_result = {k: self._clean_value(v) for k, v in result.items()} # Create document row with all result fields plus required fields row = { diff --git a/graphgen/templates/kg/__init__.py b/graphgen/templates/kg/__init__.py index efc2ca31..e39c1408 100644 --- a/graphgen/templates/kg/__init__.py +++ b/graphgen/templates/kg/__init__.py @@ -2,4 +2,4 @@ from .kg_summarization import KG_SUMMARIZATION_PROMPT from .mm_kg_extraction import MMKG_EXTRACTION_PROMPT from .omics_kg_extraction import OMICS_KG_EXTRACTION_PROMPT -from .protein_kg_extraction import PROTEIN_KG_EXTRACTION_PROMPT \ No newline at end of file +from .protein_kg_extraction import PROTEIN_KG_EXTRACTION_PROMPT From 8b908b82075d2a8ac097d359fafa02131e86c6c6 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Thu, 18 Dec 2025 19:07:49 +0800 Subject: [PATCH 06/20] fix: fix pylint problems agaaaain --- graphgen/models/generator/omics_qa_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphgen/models/generator/omics_qa_generator.py b/graphgen/models/generator/omics_qa_generator.py index 76bd8c7b..26e8d6ca 100644 --- a/graphgen/models/generator/omics_qa_generator.py +++ b/graphgen/models/generator/omics_qa_generator.py @@ -341,7 +341,7 @@ def format_generation_results( } for qa in qa_items ] - elif output_data_format == "ChatML": + if output_data_format == "ChatML": return [ { "messages": [ From 41730955186947d9389f8e84f25c38208a86c781 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Thu, 18 Dec 2025 19:12:09 +0800 Subject: [PATCH 07/20] fix: fix pylint problems 
again --- graphgen/models/generator/omics_qa_generator.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/graphgen/models/generator/omics_qa_generator.py b/graphgen/models/generator/omics_qa_generator.py index 26e8d6ca..1c8e1112 100644 --- a/graphgen/models/generator/omics_qa_generator.py +++ b/graphgen/models/generator/omics_qa_generator.py @@ -361,5 +361,4 @@ def format_generation_results( } for qa in qa_items ] - else: - raise ValueError(f"Unknown output data format: {output_data_format}") + raise ValueError(f"Unknown output data format: {output_data_format}") From 6466e27c30407e2ad1ba197ac0e382a46b4dca11 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Thu, 18 Dec 2025 19:26:57 +0800 Subject: [PATCH 08/20] chore: remove protein KG extraction template --- .../templates/kg/protein_kg_extraction.py | 144 ------------------ 1 file changed, 144 deletions(-) delete mode 100644 graphgen/templates/kg/protein_kg_extraction.py diff --git a/graphgen/templates/kg/protein_kg_extraction.py b/graphgen/templates/kg/protein_kg_extraction.py deleted file mode 100644 index 16e9c868..00000000 --- a/graphgen/templates/kg/protein_kg_extraction.py +++ /dev/null @@ -1,144 +0,0 @@ -# pylint: disable=C0301 -TEMPLATE_EN: str = """You are an expert in protein science and knowledge-graph construction. -Your task is to extract a star-shaped knowledge graph centered on **a single protein** mentioned in the given text. - --Goal- -Given free-text that discusses one or more proteins, identify: -1. The **central protein** (the first-mentioned protein or the protein explicitly indicated by the user). -2. All entities that are **directly related** to this central protein. -3. All relationships that **directly link** those entities to the central protein (star edges). - -Use English as the output language. Please provide only the result without any extra explanations. - --Steps- -1. Identify the **central protein entity** and all **directly-related entities** from the text. - For the **central protein**, extract: - - entity_name: use the full name or UniProt ID if given; capitalized. - - entity_type: always `protein`. - - entity_summary: concise description of its main biological role, location, or significance in the text. - - For each **directly-related entity**, extract: - - entity_name: capitalized. - - entity_type: one of [{entity_types}]. - - entity_summary: comprehensive summary of its attributes/activities **as stated in the text**. - - Format each entity as - ("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) - -2. From the entities found in Step 1, list every **(central protein → related entity)** pair that is **clearly related**. - For each pair extract: - - source_entity: the **central protein** name. - - target_entity: the related entity name. - - relationship_summary: short explanation of how the central protein is connected to this entity **according to the text**. - - Format each relationship as - ("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) - -3. Output a single list of all entities and relationships from Steps 1–2, using **{record_delimiter}** as the delimiter. - -4. Finish by printing {completion_delimiter} - -################ --Example- -################ -Text: -################ -The tumor-suppressor protein p53 is a transcription factor that responds to DNA damage. -Phosphorylation of p53 by ATM kinase at serine-15 enhances its stability. -MDM2, an E3 ubiquitin ligase, negatively regulates p53 via ubiquitination. 
-################ -Output: -("entity"{tuple_delimiter}"p53"{tuple_delimiter}"protein"{tuple_delimiter}"Tumor-suppressor transcription factor that responds to DNA damage and is regulated by post-translational modifications."){record_delimiter} -("entity"{tuple_delimiter}"ATM kinase"{tuple_delimiter}"protein"{tuple_delimiter}"Protein kinase that phosphorylates p53 at serine-15, thereby enhancing p53 stability."){record_delimiter} -("entity"{tuple_delimiter}"serine-15"{tuple_delimiter}"site"{tuple_delimiter}"Phosphorylation site on p53 that is targeted by ATM kinase."){record_delimiter} -("entity"{tuple_delimiter}"MDM2"{tuple_delimiter}"protein"{tuple_delimiter}"E3 ubiquitin ligase that negatively regulates p53 through ubiquitination."){record_delimiter} -("entity"{tuple_delimiter}"DNA damage"{tuple_delimiter}"concept"{tuple_delimiter}"Cellular stress signal that activates p53-mediated transcriptional response."){record_delimiter} -("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"ATM kinase"{tuple_delimiter}"ATM kinase phosphorylates p53, enhancing its stability."){record_delimiter} -("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"serine-15"{tuple_delimiter}"p53 is phosphorylated at serine-15 by ATM kinase."){record_delimiter} -("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"MDM2"{tuple_delimiter}"MDM2 ubiquitinates p53, negatively regulating its activity."){record_delimiter} -("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"DNA damage"{tuple_delimiter}"p53 acts as a sensor-transcription factor in response to DNA damage."){completion_delimiter} - -################ --Real Data- -Entity_types: {entity_types} -Text: {input_text} -################ -Output: -""" - - -TEMPLATE_ZH: str = """您是蛋白质科学与知识图谱构建专家。 -任务:从给定文本中抽取以**一个中心蛋白质**为核心的星型知识图谱。 - --目标- -文本可能提及一个或多个蛋白质,请: -1. 确定**中心蛋白质**(文本首个提及或用户指定的蛋白)。 -2. 识别所有与中心蛋白**直接相关**的实体。 -3. 仅保留**中心蛋白→相关实体**的直接关系(星型边)。 - -使用中文输出。请直接给出结果,不要增加额外说明。 - --步骤- -1. 确定**中心蛋白质实体**及所有**直接相关实体**。 - 对于**中心蛋白质**: - - entity_name:全名或UniProt ID,首字母大写。 - - entity_type:固定为`protein`。 - - entity_summary:简述其在文中的生物学功能、定位或意义。 - - 对于每个**直接相关实体**: - - entity_name:首字母大写。 - - entity_type:可选类型[{entity_types}]。 - - entity_summary:全面总结其在文中与中心蛋白相关的属性/活动。 - - 格式:("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) - -2. 在步骤1的实体中,列出所有**(中心蛋白→相关实体)**的明显关系对。 - 每对提取: - - source_entity:中心蛋白名称。 - - target_entity:相关实体名称。 - - relationship_summary:简要说明文中二者如何直接关联。 - - 格式:("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) - -3. 将步骤1–2的所有实体与关系合并为单列表,用**{record_delimiter}**分隔。 - -4. 
输出结束标记{completion_delimiter} - -################ --示例- -################ -文本: -################ -肿瘤抑制蛋白p53是一种转录因子,可响应DNA损伤。ATM激酶在第15位丝氨酸磷酸化p53,增强其稳定性。E3泛素连接酶MDM2通过泛素化负调控p53。 -################ -输出: -("entity"{tuple_delimiter}"p53"{tuple_delimiter}"protein"{tuple_delimiter}"肿瘤抑制转录因子,能感知DNA损伤并通过翻译后修饰被调控。"){record_delimiter} -("entity"{tuple_delimiter}"ATM激酶"{tuple_delimiter}"protein"{tuple_delimiter}"蛋白激酶,在丝氨酸-15位点磷酸化p53,从而提高其稳定性。"){record_delimiter} -("entity"{tuple_delimiter}"丝氨酸-15"{tuple_delimiter}"site"{tuple_delimiter}"p53上被ATM激酶靶向的磷酸化位点。"){record_delimiter} -("entity"{tuple_delimiter}"MDM2"{tuple_delimiter}"protein"{tuple_delimiter}"E3泛素连接酶,通过泛素化负调控p53。"){record_delimiter} -("entity"{tuple_delimiter}"DNA损伤"{tuple_delimiter}"concept"{tuple_delimiter}"细胞内应激信号,可激活p53介导的转录应答。"){record_delimiter} -("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"ATM激酶"{tuple_delimiter}"ATM激酶磷酸化p53,增强其稳定性。"){record_delimiter} -("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"丝氨酸-15"{tuple_delimiter}"p53在该位点被ATM激酶磷酸化。"){record_delimiter} -("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"MDM2"{tuple_delimiter}"MDM2对p53进行泛素化,负向调控其活性。"){record_delimiter} -("relationship"{tuple_delimiter}"p53"{tuple_delimiter}"DNA损伤"{tuple_delimiter}"p53作为感受器-转录因子响应DNA损伤。"){completion_delimiter} - -################ --真实数据- -实体类型:{entity_types} -文本:{input_text} -################ -输出: -""" - - -PROTEIN_KG_EXTRACTION_PROMPT: dict = { - "en": TEMPLATE_EN, - "zh": TEMPLATE_ZH, - "FORMAT": { - "tuple_delimiter": "<|>", - "record_delimiter": "##", - "completion_delimiter": "<|COMPLETE|>", - "entity_types": "protein, gene, site, modification, pathway, disease, drug, organism, tissue, cell_line, " - "experiment, technology, concept, location, organization, person, mission, science", - }, -} From 82665bf24cc6db820118ec47698cc6a9cb3b69d7 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Fri, 19 Dec 2025 01:12:06 +0800 Subject: [PATCH 09/20] refactor: remove unused read_stream method from JSONReader --- graphgen/models/reader/json_reader.py | 37 +-------------------------- 1 file changed, 1 insertion(+), 36 deletions(-) diff --git a/graphgen/models/reader/json_reader.py b/graphgen/models/reader/json_reader.py index a25abee8..abe00034 100644 --- a/graphgen/models/reader/json_reader.py +++ b/graphgen/models/reader/json_reader.py @@ -1,6 +1,6 @@ import json import os -from typing import Any, Dict, Iterator, List, Union +from typing import Any, Dict, List, Union import ray import ray.data @@ -44,41 +44,6 @@ def read(self, input_path: Union[str, List[str]]) -> ray.data.Dataset: ds = ds.filter(self._should_keep_item) return ds - def read_stream(self, file_path: str) -> Iterator[Dict[str, Any]]: - """ - Stream read JSONL files line by line without loading entire file into memory. - Returns an iterator that yields filtered documents. - - :param file_path: Path to the JSONL file. - :return: Iterator of dictionaries containing the data. 
- """ - if not file_path.endswith(".jsonl"): - raise ValueError("read_stream only supports JSONL files, not JSON files") - - with open(file_path, "r", encoding="utf-8") as f: - for line in f: - try: - doc = json.loads(line) - assert "type" in doc, f"Missing 'type' in document: {doc}" - if doc.get("type") == "text" and self.text_column not in doc: - raise ValueError( - f"Missing '{self.text_column}' in document: {doc}" - ) - - # Apply filtering logic inline (similar to BaseReader.filter) - if doc.get("type") == "text": - content = doc.get(self.text_column, "").strip() - if content: - yield doc - elif doc.get("type") in ("image", "table", "equation"): - img_path = doc.get("img_path") - if self._image_exists(img_path): - yield doc - else: - yield doc - except json.JSONDecodeError as e: - logger.error("Error decoding JSON line: %s. Error: %s", line, e) - @staticmethod def _image_exists(path_or_url: str, timeout: int = 3) -> bool: """ From 2a27f69ddebf674762d93a3de7c2ae2050a3d6b7 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Fri, 19 Dec 2025 01:31:28 +0800 Subject: [PATCH 10/20] refactor: remove repeated image_exists method from JSONReader --- graphgen/models/reader/json_reader.py | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/graphgen/models/reader/json_reader.py b/graphgen/models/reader/json_reader.py index abe00034..6752e042 100644 --- a/graphgen/models/reader/json_reader.py +++ b/graphgen/models/reader/json_reader.py @@ -1,12 +1,10 @@ import json -import os -from typing import Any, Dict, List, Union +from typing import List, Union import ray import ray.data from graphgen.bases.base_reader import BaseReader -from graphgen.utils import logger class JSONReader(BaseReader): @@ -44,27 +42,6 @@ def read(self, input_path: Union[str, List[str]]) -> ray.data.Dataset: ds = ds.filter(self._should_keep_item) return ds - @staticmethod - def _image_exists(path_or_url: str, timeout: int = 3) -> bool: - """ - Check if an image exists at the given local path or URL. - :param path_or_url: Local file path or remote URL of the image. - :param timeout: Timeout for remote URL requests in seconds. - :return: True if the image exists, False otherwise. 
- """ - if not path_or_url: - return False - if not path_or_url.startswith(("http://", "https://", "ftp://")): - path = path_or_url.replace("file://", "", 1) - path = os.path.abspath(path) - return os.path.isfile(path) - try: - import requests - resp = requests.head(path_or_url, allow_redirects=True, timeout=timeout) - return resp.status_code == 200 - except Exception: - return False - @staticmethod def _unify_schema(data): """ From 91abfde82d03333e361683cbcd2471517a6b076a Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Fri, 19 Dec 2025 11:41:21 +0800 Subject: [PATCH 11/20] refactor: clean up logging in Engine --- graphgen/engine.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/graphgen/engine.py b/graphgen/engine.py index 6d6961d5..4ef13e97 100644 --- a/graphgen/engine.py +++ b/graphgen/engine.py @@ -1,13 +1,10 @@ import inspect -import logging -import os from collections import defaultdict, deque from functools import wraps from typing import Any, Callable, Dict, List, Set import ray import ray.data -from ray.data import DataContext from graphgen.bases import Config, Node from graphgen.utils import logger @@ -22,23 +19,9 @@ def __init__( self.functions = functions self.datasets: Dict[str, ray.data.Dataset] = {} - # Disable Ray Data progress bars and verbose output - os.environ.setdefault("RAY_DATA_DISABLE_PROGRESS_BARS", "1") - # Disable metrics exporter to avoid RpcError - os.environ.setdefault("RAY_DISABLE_IMPORTANT_WARNING", "1") - ctx = DataContext.get_current() - ctx.enable_rich_progress_bars = False - ctx.use_ray_tqdm = False - # Disable tensor extension casting to avoid conversion errors with complex types - # (e.g., gene_synonyms, gene_names which are lists/arrays) - ctx.enable_tensor_extension_casting = False - if not ray.is_initialized(): - # Disable metrics exporter to avoid RpcError - ray_init_kwargs.setdefault("_metrics_export_port", 0) context = ray.init( ignore_reinit_error=True, - logging_level=logging.ERROR, log_to_driver=True, **ray_init_kwargs, ) From f2ee12f2028998067c029bcd96e4bf86756397c9 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Fri, 19 Dec 2025 14:00:16 +0800 Subject: [PATCH 12/20] refactor: enhance initialization of services with configurable backends --- graphgen/operators/chunk/chunk_service.py | 6 ++++-- .../operators/partition/partition_service.py | 19 ++++++++++--------- graphgen/operators/quiz/quiz_service.py | 6 ++++-- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/graphgen/operators/chunk/chunk_service.py b/graphgen/operators/chunk/chunk_service.py index b6775764..102c74fd 100644 --- a/graphgen/operators/chunk/chunk_service.py +++ b/graphgen/operators/chunk/chunk_service.py @@ -42,12 +42,14 @@ def split_chunks(text: str, language: str = "en", **kwargs) -> list: class ChunkService(BaseOperator): - def __init__(self, working_dir: str = "cache", **chunk_kwargs): + def __init__( + self, working_dir: str = "cache", kv_backend: str = "rocksdb", **chunk_kwargs + ): super().__init__(working_dir=working_dir, op_name="chunk_service") tokenizer_model = os.getenv("TOKENIZER_MODEL", "cl100k_base") self.tokenizer_instance: Tokenizer = Tokenizer(model_name=tokenizer_model) self.chunk_storage = init_storage( - backend="rocksdb", + backend=kv_backend, working_dir=working_dir, namespace="chunk", ) diff --git a/graphgen/operators/partition/partition_service.py b/graphgen/operators/partition/partition_service.py index 603577fc..f7aae20a 100644 --- 
a/graphgen/operators/partition/partition_service.py +++ b/graphgen/operators/partition/partition_service.py @@ -19,15 +19,21 @@ class PartitionService(BaseOperator): - def __init__(self, working_dir: str = "cache", **partition_kwargs): + def __init__( + self, + working_dir: str = "cache", + graph_backend: str = "kuzu", + kv_backend: str = "rocksdb", + **partition_kwargs, + ): super().__init__(working_dir=working_dir, op_name="partition_service") self.kg_instance: BaseGraphStorage = init_storage( - backend="kuzu", + backend=graph_backend, working_dir=working_dir, namespace="graph", ) self.chunk_storage: BaseKVStorage = init_storage( - backend="rocksdb", + backend=kv_backend, working_dir=working_dir, namespace="chunk", ) @@ -65,13 +71,8 @@ def partition(self) -> Iterable[pd.DataFrame]: partitioner = LeidenPartitioner() elif method == "anchor_bfs": logger.info("Partitioning knowledge graph using Anchor BFS method.") - anchor_type = method_params.get("anchor_type") - if isinstance(anchor_type, list): - logger.info("Using multiple anchor types: %s", anchor_type) - else: - logger.info("Using single anchor type: %s", anchor_type) partitioner = AnchorBFSPartitioner( - anchor_type=anchor_type, + anchor_type=method_params.get("anchor_type"), anchor_ids=set(method_params.get("anchor_ids", [])) if method_params.get("anchor_ids") else None, diff --git a/graphgen/operators/quiz/quiz_service.py b/graphgen/operators/quiz/quiz_service.py index 66dc1193..a6aeb7be 100644 --- a/graphgen/operators/quiz/quiz_service.py +++ b/graphgen/operators/quiz/quiz_service.py @@ -12,6 +12,8 @@ class QuizService(BaseOperator): def __init__( self, working_dir: str = "cache", + graph_backend: str = "kuzu", + kv_backend: str = "rocksdb", quiz_samples: int = 1, concurrency_limit: int = 200, ): @@ -19,11 +21,11 @@ def __init__( self.quiz_samples = quiz_samples self.llm_client: BaseLLMWrapper = init_llm("synthesizer") self.graph_storage: BaseGraphStorage = init_storage( - backend="networkx", working_dir=working_dir, namespace="graph" + backend=graph_backend, working_dir=working_dir, namespace="graph" ) # { _quiz_id: { "description": str, "quizzes": List[Tuple[str, str]] } } self.quiz_storage: BaseKVStorage = init_storage( - backend="rocksdb", working_dir=working_dir, namespace="quiz" + backend=kv_backend, working_dir=working_dir, namespace="quiz" ) self.generator = QuizGenerator(self.llm_client) self.concurrency_limit = concurrency_limit From b21f4ce3733e031a07d815fabd9d861da57cea24 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Fri, 19 Dec 2025 14:02:15 +0800 Subject: [PATCH 13/20] refactor: remove unused progress bar from run_concurrent --- graphgen/utils/run_concurrent.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/graphgen/utils/run_concurrent.py b/graphgen/utils/run_concurrent.py index 8315b953..dfbfde32 100644 --- a/graphgen/utils/run_concurrent.py +++ b/graphgen/utils/run_concurrent.py @@ -1,7 +1,6 @@ import asyncio from typing import Awaitable, Callable, List, Optional, TypeVar, Union -import gradio as gr from tqdm.asyncio import tqdm as tqdm_async from graphgen.utils.log import logger @@ -18,7 +17,6 @@ def run_concurrent( *, desc: str = "processing", unit: str = "item", - progress_bar: Optional[gr.Progress] = None, save_interval: int = 0, save_callback: Optional[Callable[[List[R], int], None]] = None, max_concurrent: Optional[int] = None, @@ -71,10 +69,6 @@ async def run_with_semaphore(item: T) -> R: completed_count += 1 pbar.update(1) - if progress_bar is not None: - progress = 
completed_count / len(items) - progress_bar(progress, desc=f"{desc} ({completed_count}/{len(items)})") - # Periodic save if save_interval > 0 and save_callback is not None and completed_count % save_interval == 0: try: @@ -88,9 +82,6 @@ pbar.close() - if progress_bar is not None: - progress_bar(1.0, desc=f"{desc} (completed)") - # Save remaining results if any if save_interval > 0 and save_callback is not None and pending_save_results: try: From 23fa2bb12659bf2be7f24b4ca8e5cea5ca774e2c Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Fri, 19 Dec 2025 14:33:59 +0800 Subject: [PATCH 14/20] refactor: simplify anchor_type initialization in AnchorBFSPartitioner --- graphgen/models/partitioner/anchor_bfs_partitioner.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/graphgen/models/partitioner/anchor_bfs_partitioner.py b/graphgen/models/partitioner/anchor_bfs_partitioner.py index f94fe6df..62c9a0db 100644 --- a/graphgen/models/partitioner/anchor_bfs_partitioner.py +++ b/graphgen/models/partitioner/anchor_bfs_partitioner.py @@ -1,6 +1,6 @@ import random from collections import deque -from typing import Any, Iterable, List, Literal, Set, Tuple, Union +from typing import Any, Iterable, List, Set, Tuple from graphgen.bases import BaseGraphStorage from graphgen.bases.datatypes import Community @@ -22,10 +22,7 @@ class AnchorBFSPartitioner(BFSPartitioner): def __init__( self, - anchor_type: Union[ - Literal["image", "dna", "rna", "protein"], - List[Literal["dna", "rna", "protein"]], - ] = "image", + anchor_type: list = ["image"], anchor_ids: Set[str] | None = None, ) -> None: super().__init__() From 41456e856bddc06aebae56e9abe6a17fd5354b7b Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Fri, 19 Dec 2025 14:41:22 +0800 Subject: [PATCH 15/20] style: fix pylint problems in AnchorBFSPartitioner --- graphgen/models/partitioner/anchor_bfs_partitioner.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/graphgen/models/partitioner/anchor_bfs_partitioner.py b/graphgen/models/partitioner/anchor_bfs_partitioner.py index 62c9a0db..b76e3ab0 100644 --- a/graphgen/models/partitioner/anchor_bfs_partitioner.py +++ b/graphgen/models/partitioner/anchor_bfs_partitioner.py @@ -22,11 +22,12 @@ class AnchorBFSPartitioner(BFSPartitioner): def __init__( self, - anchor_type: list = ["image"], + anchor_type: list | None = None, anchor_ids: Set[str] | None = None, ) -> None: super().__init__() - # Normalize anchor_type to always be a list for internal processing + if anchor_type is None: + anchor_type = ["image"] if isinstance(anchor_type, str): self.anchor_types = [anchor_type] else: From 610c48d337efe17c6b0798777af4dd5234b81545 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Fri, 19 Dec 2025 15:37:10 +0800 Subject: [PATCH 16/20] refactor: reorganize search and db build scripts for DNA, RNA, and protein --- examples/search/search_dna.sh | 4 - examples/search/search_dna/README.md | 84 +++++++++++++++++++ .../build_db.sh} | 0 examples/search/search_dna/search_dna.sh | 4 + .../{ => search_dna}/search_dna_config.yaml | 0 examples/search/search_protein/README.md | 80 ++++++++++++++++++ .../build_db.sh} | 0 .../search_protein_config.yaml | 0 .../search/search_protein/search_uniprot.sh | 3 + examples/search/search_rna.sh | 4 - examples/search/search_rna/README.md | 80 ++++++++++++++++++ .../build_db.sh} | 0 examples/search/search_rna/search_rna.sh | 4 + .../{ =>
search_rna}/search_rna_config.yaml | 0 examples/search/search_uniprot.sh | 3 - 15 files changed, 255 insertions(+), 11 deletions(-) delete mode 100644 examples/search/search_dna.sh create mode 100644 examples/search/search_dna/README.md rename examples/search/{build_db/build_dna_blast_db.sh => search_dna/build_db.sh} (100%) create mode 100644 examples/search/search_dna/search_dna.sh rename examples/search/{ => search_dna}/search_dna_config.yaml (100%) create mode 100644 examples/search/search_protein/README.md rename examples/search/{build_db/build_protein_blast_db.sh => search_protein/build_db.sh} (100%) rename examples/search/{ => search_protein}/search_protein_config.yaml (100%) create mode 100644 examples/search/search_protein/search_uniprot.sh delete mode 100644 examples/search/search_rna.sh create mode 100644 examples/search/search_rna/README.md rename examples/search/{build_db/build_rna_blast_db.sh => search_rna/build_db.sh} (100%) create mode 100644 examples/search/search_rna/search_rna.sh rename examples/search/{ => search_rna}/search_rna_config.yaml (100%) delete mode 100644 examples/search/search_uniprot.sh diff --git a/examples/search/search_dna.sh b/examples/search/search_dna.sh deleted file mode 100644 index e05ab751..00000000 --- a/examples/search/search_dna.sh +++ /dev/null @@ -1,4 +0,0 @@ -python3 -m graphgen.run \ ---config_file examples/search/search_dna_config.yaml \ ---output_dir cache/ - diff --git a/examples/search/search_dna/README.md b/examples/search/search_dna/README.md new file mode 100644 index 00000000..f4e8be7a --- /dev/null +++ b/examples/search/search_dna/README.md @@ -0,0 +1,84 @@ +# Search DNA Sequences + +This example demonstrates how to search DNA sequences from NCBI RefSeq database using BLAST. + +## Overview + +The DNA search pipeline reads DNA sequence queries and searches against NCBI RefSeq database to find similar sequences and retrieve associated metadata. + +## Quick Start + +### 1. Build Local BLAST Database (Optional) + +If you want to use local BLAST for faster searches, first build the database: + +```bash +./build_db.sh [human_mouse_drosophila_yeast|representative|complete|all] +``` + +Options: +- `human_mouse_drosophila_yeast`: Download only Homo sapiens, Mus musculus, Drosophila melanogaster, and Saccharomyces cerevisiae sequences (minimal, smallest) +- `representative`: Download genomic sequences from major categories (recommended, smaller) +- `complete`: Download all complete genomic sequences from complete/ directory (very large) +- `all`: Download all genomic sequences from all categories (very large) + +The script will create a BLAST database in `refseq_${RELEASE}/` directory. + +### 2. Configure Search Parameters + +Edit `search_dna_config.yaml` to set: + +- **Input file path**: Set the path to your DNA sequence queries +- **NCBI parameters**: + - `email`: Your email address (required by NCBI) + - `tool`: Tool name for NCBI API + - `use_local_blast`: Set to `true` if you have a local BLAST database + - `local_blast_db`: Path to your local BLAST database (without .nhr extension) + +Example configuration: +```yaml +input_path: + - examples/input_examples/search_dna_demo.jsonl + +data_sources: [ncbi] +ncbi_params: + email: your_email@example.com # Required! + tool: GraphGen + use_local_blast: true + local_blast_db: refseq_release/refseq_release +``` + +### 3. 
Run the Search + +```bash +./search_dna.sh +``` + +Or run directly with Python: + +```bash +python3 -m graphgen.run \ + --config_file examples/search/search_dna/search_dna_config.yaml \ + --output_dir cache/ +``` + +## Input Format + +The input file should be in JSONL format with DNA sequence queries: + +```jsonl +{"type": "text", "content": "BRCA1"} +{"type": "text", "content": ">query\nATGCGATCG..."} +{"type": "text", "content": "ATGCGATCG..."} +``` + +## Output + +The search results will be saved in the output directory with matched sequences and metadata from NCBI RefSeq. + +## Notes + +- **NCBI requires an email address** - Make sure to set `email` in `ncbi_params` +- **Local BLAST** provides faster searches and doesn't require internet connection during search +- The local BLAST database can be very large (several GB to TB depending on the download type) +- Adjust `max_concurrent` based on your system resources and API rate limits diff --git a/examples/search/build_db/build_dna_blast_db.sh b/examples/search/search_dna/build_db.sh similarity index 100% rename from examples/search/build_db/build_dna_blast_db.sh rename to examples/search/search_dna/build_db.sh diff --git a/examples/search/search_dna/search_dna.sh b/examples/search/search_dna/search_dna.sh new file mode 100644 index 00000000..67c6beaa --- /dev/null +++ b/examples/search/search_dna/search_dna.sh @@ -0,0 +1,4 @@ +python3 -m graphgen.run \ +--config_file examples/search/search_dna/search_dna_config.yaml \ +--output_dir cache/ + diff --git a/examples/search/search_dna_config.yaml b/examples/search/search_dna/search_dna_config.yaml similarity index 100% rename from examples/search/search_dna_config.yaml rename to examples/search/search_dna/search_dna_config.yaml diff --git a/examples/search/search_protein/README.md b/examples/search/search_protein/README.md new file mode 100644 index 00000000..650a8c69 --- /dev/null +++ b/examples/search/search_protein/README.md @@ -0,0 +1,80 @@ +# Search Protein Sequences + +This example demonstrates how to search protein sequences from UniProt database using BLAST. + +## Overview + +The protein search pipeline reads protein sequence queries and searches against UniProt database to find similar sequences and retrieve associated metadata. + +## Quick Start + +### 1. Build Local BLAST Database (Optional) + +If you want to use local BLAST for faster searches, first build the database: + +```bash +./build_db.sh +``` + +The script will download UniProt Swiss-Prot database and create a BLAST database. You can configure the download mode: +- `sprot` (default): Download only Swiss-Prot (high quality, curated) +- `full`: Download both Swiss-Prot and TrEMBL (complete database) + +The script will create a BLAST database in `${RELEASE}/` directory. + +### 2. Configure Search Parameters + +Edit `search_protein_config.yaml` to set: + +- **Input file path**: Set the path to your protein sequence queries +- **UniProt parameters**: + - `use_local_blast`: Set to `true` if you have a local BLAST database + - `local_blast_db`: Path to your local BLAST database (format: `/path/to/${RELEASE}/uniprot_sprot`) + +Example configuration: +```yaml +input_path: + - examples/input_examples/search_protein_demo.jsonl + +data_sources: [uniprot] +uniprot_params: + use_local_blast: true + local_blast_db: /your_path/2024_01/uniprot_sprot + # options: uniprot_sprot (recommended, high quality), uniprot_trembl, or uniprot_${RELEASE} (merged database) +``` + +### 3. 
Run the Search + +```bash +./search_uniprot.sh +``` + +Or run directly with Python: + +```bash +python3 -m graphgen.run \ + --config_file examples/search/search_protein/search_protein_config.yaml \ + --output_dir cache/ +``` + +## Input Format + +The input file should be in JSONL format with protein sequence queries: + +```jsonl +{"type": "text", "content": "P01308"} +{"type": "text", "content": "insulin"} +{"type": "text", "content": "MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"} +``` + +## Output + +The search results will be saved in the output directory with matched sequences and metadata from UniProt. + +## Notes + +- **Local BLAST** provides faster searches and doesn't require internet connection during search +- **Swiss-Prot** is recommended for high-quality, curated protein sequences +- **TrEMBL** contains automatically annotated sequences (larger database) +- The merged database (`uniprot_${RELEASE}`) contains both Swiss-Prot and TrEMBL +- Adjust `max_concurrent` based on your system resources and API rate limits diff --git a/examples/search/build_db/build_protein_blast_db.sh b/examples/search/search_protein/build_db.sh similarity index 100% rename from examples/search/build_db/build_protein_blast_db.sh rename to examples/search/search_protein/build_db.sh diff --git a/examples/search/search_protein_config.yaml b/examples/search/search_protein/search_protein_config.yaml similarity index 100% rename from examples/search/search_protein_config.yaml rename to examples/search/search_protein/search_protein_config.yaml diff --git a/examples/search/search_protein/search_uniprot.sh b/examples/search/search_protein/search_uniprot.sh new file mode 100644 index 00000000..1032dfa0 --- /dev/null +++ b/examples/search/search_protein/search_uniprot.sh @@ -0,0 +1,3 @@ +python3 -m graphgen.run \ +--config_file examples/search/search_protein/search_protein_config.yaml \ +--output_dir cache/ diff --git a/examples/search/search_rna.sh b/examples/search/search_rna.sh deleted file mode 100644 index 2bf5a406..00000000 --- a/examples/search/search_rna.sh +++ /dev/null @@ -1,4 +0,0 @@ -python3 -m graphgen.run \ ---config_file examples/search/search_rna_config.yaml \ ---output_dir cache/ - diff --git a/examples/search/search_rna/README.md b/examples/search/search_rna/README.md new file mode 100644 index 00000000..2a59fed6 --- /dev/null +++ b/examples/search/search_rna/README.md @@ -0,0 +1,80 @@ +# Search RNA Sequences + +This example demonstrates how to search RNA sequences from RNAcentral database using BLAST. + +## Overview + +The RNA search pipeline reads RNA sequence queries and searches against RNAcentral database to find similar sequences and retrieve associated metadata. + +## Quick Start + +### 1. Build Local BLAST Database (Optional) + +If you want to use local BLAST for faster searches, first build the database: + +```bash +./build_db.sh [all|list|selected|database_name...] 
+``` + +Options: +- `all`: Download complete active database (~8.4G compressed) +- `list`: List all available database subsets +- `selected`: Download predefined database subsets (ensembl_gencode, mirbase, gtrnadb, refseq, lncbase, rfam) +- `database_name`: Download specific database subset (e.g., refseq, rfam, mirbase) + +The script will create a BLAST database in `rnacentral_${RELEASE}/` or `rnacentral_${DB_NAME}_${RELEASE}/` directory. + +### 2. Configure Search Parameters + +Edit `search_rna_config.yaml` to set: + +- **Input file path**: Set the path to your RNA sequence queries +- **RNAcentral parameters**: + - `use_local_blast`: Set to `true` if you have a local BLAST database + - `local_blast_db`: Path to your local BLAST database (without .nhr extension) + +Example configuration: +```yaml +input_path: + - examples/input_examples/search_rna_demo.jsonl + +data_sources: [rnacentral] +rnacentral_params: + use_local_blast: true + local_blast_db: rnacentral_ensembl_gencode_YYYYMMDD/ensembl_gencode_YYYYMMDD +``` + +### 3. Run the Search + +```bash +./search_rna.sh +``` + +Or run directly with Python: + +```bash +python3 -m graphgen.run \ + --config_file examples/search/search_rna/search_rna_config.yaml \ + --output_dir cache/ +``` + +## Input Format + +The input file should be in JSONL format with RNA sequence queries: + +```jsonl +{"type": "text", "content": "miR-21"} +{"type": "text", "content": ">query\nAUGCAUGC..."} +{"type": "text", "content": "AUGCAUGC..."} +``` + +## Output + +The search results will be saved in the output directory with matched sequences and metadata from RNAcentral. + +## Notes + +- **Local BLAST** provides faster searches and doesn't require internet connection during search +- The complete RNAcentral database is large (~8.4G compressed), consider using specific database subsets for smaller downloads +- RNAcentral uses URS IDs (e.g., URS000149A9AF) which match the online RNAcentral API database +- Adjust `max_concurrent` based on your system resources and API rate limits diff --git a/examples/search/build_db/build_rna_blast_db.sh b/examples/search/search_rna/build_db.sh similarity index 100% rename from examples/search/build_db/build_rna_blast_db.sh rename to examples/search/search_rna/build_db.sh diff --git a/examples/search/search_rna/search_rna.sh b/examples/search/search_rna/search_rna.sh new file mode 100644 index 00000000..9735f90d --- /dev/null +++ b/examples/search/search_rna/search_rna.sh @@ -0,0 +1,4 @@ +python3 -m graphgen.run \ +--config_file examples/search/search_rna/search_rna_config.yaml \ +--output_dir cache/ + diff --git a/examples/search/search_rna_config.yaml b/examples/search/search_rna/search_rna_config.yaml similarity index 100% rename from examples/search/search_rna_config.yaml rename to examples/search/search_rna/search_rna_config.yaml diff --git a/examples/search/search_uniprot.sh b/examples/search/search_uniprot.sh deleted file mode 100644 index fbb33d70..00000000 --- a/examples/search/search_uniprot.sh +++ /dev/null @@ -1,3 +0,0 @@ -python3 -m graphgen.run \ ---config_file examples/search/search_protein_config.yaml \ ---output_dir cache/ From 8566fb475abab847025e8cf99ba44367b952f241 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Fri, 19 Dec 2025 16:14:18 +0800 Subject: [PATCH 17/20] refactor: remove unused async_to_sync_method --- graphgen/utils/__init__.py | 1 - graphgen/utils/wrap.py | 21 --------------------- 2 files changed, 22 deletions(-) delete mode 100644 graphgen/utils/wrap.py diff --git 
a/graphgen/utils/__init__.py b/graphgen/utils/__init__.py index ec118816..840b2cec 100644 --- a/graphgen/utils/__init__.py +++ b/graphgen/utils/__init__.py @@ -19,4 +19,3 @@ from .log import CURRENT_LOGGER_VAR, logger, set_logger from .loop import create_event_loop from .run_concurrent import run_concurrent -from .wrap import async_to_sync_method diff --git a/graphgen/utils/wrap.py b/graphgen/utils/wrap.py deleted file mode 100644 index 9689cea6..00000000 --- a/graphgen/utils/wrap.py +++ /dev/null @@ -1,21 +0,0 @@ -from functools import wraps -from typing import Any, Callable - -from .loop import create_event_loop - - -def async_to_sync_method(func: Callable) -> Callable: - @wraps(func) - def wrapper(self, *args, **kwargs) -> Any: - loop, created = create_event_loop() - try: - if loop.is_running(): - raise RuntimeError( - "Cannot use async_to_sync_method when event loop is already running." - ) - return loop.run_until_complete(func(self, *args, **kwargs)) - finally: - if created: - loop.close() - - return wrapper From 7dc02ff32cdcc529cec8c1d37163dc034f6482b7 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Fri, 19 Dec 2025 16:30:58 +0800 Subject: [PATCH 18/20] fix: update omics qa generation template to avoid repetition --- .../generation/omics_qa_generation.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/graphgen/templates/generation/omics_qa_generation.py b/graphgen/templates/generation/omics_qa_generation.py index 1d356e62..71f86b70 100644 --- a/graphgen/templates/generation/omics_qa_generation.py +++ b/graphgen/templates/generation/omics_qa_generation.py @@ -37,12 +37,14 @@ {relationships} ################ Directly output the generated QA pairs below. Do NOT copy any example questions, and do NOT include extraneous text. +IMPORTANT: Generate actual questions and answers, NOT placeholders. Do NOT include angle brackets or placeholder text like <Question 1> or <Answer 1>.
+Simply output your generated questions and answers in the following format: -Question: <Question 1> -Answer: <Answer 1> +Question: [Your actual question here] +Answer: [Your actual answer here] -Question: <Question 2> -Answer: <Answer 2> +Question: [Your actual question here] +Answer: [Your actual answer here] """ @@ -84,12 +86,14 @@ {relationships} ################ 请直接在下方输出生成的问答对,不要复制任何示例,不要输出无关内容。 +重要提示:请生成实际的问题和答案,不要使用占位符。不要包含尖括号或占位符文本(如 <问题1> 或 <答案1>)。 +请直接按照以下格式输出生成的问题和答案: -问题: <问题1> -答案: <答案1> +问题: [你生成的实际问题] +答案: [你生成的实际答案] -问题: <问题2> -答案: <答案2> +问题: [你生成的实际问题] +答案: [你生成的实际答案] """ From f29ab7928adc835d4a92ee399450e176fc6c6e09 Mon Sep 17 00:00:00 2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Wed, 24 Dec 2025 14:43:13 +0800 Subject: [PATCH 19/20] refactor: remove output_dir argument from multi-omics and search scripts --- examples/generate/generate_omics_qa/generate_omics_qa.sh | 3 +-- .../generate/generate_omics_qa/generate_omics_qa_searched.sh | 3 +-- examples/search/search_dna/search_dna.sh | 3 +-- examples/search/search_protein/search_uniprot.sh | 3 +-- examples/search/search_rna/search_rna.sh | 3 +-- 5 files changed, 5 insertions(+), 10 deletions(-) diff --git a/examples/generate/generate_omics_qa/generate_omics_qa.sh b/examples/generate/generate_omics_qa/generate_omics_qa.sh index 3afb129e..0f1420f4 100755 --- a/examples/generate/generate_omics_qa/generate_omics_qa.sh +++ b/examples/generate/generate_omics_qa/generate_omics_qa.sh @@ -1,3 +1,2 @@ python3 -m graphgen.run \ - --config_file examples/generate/generate_omics_qa/omics_qa_config.yaml \ - --output_dir cache/ + --config_file examples/generate/generate_omics_qa/omics_qa_config.yaml diff --git a/examples/generate/generate_omics_qa/generate_omics_qa_searched.sh b/examples/generate/generate_omics_qa/generate_omics_qa_searched.sh index 20b0b533..ec178889 100755 --- a/examples/generate/generate_omics_qa/generate_omics_qa_searched.sh +++ b/examples/generate/generate_omics_qa/generate_omics_qa_searched.sh @@ -1,3 +1,2 @@ python3 -m graphgen.run \ - --config_file examples/generate/generate_omics_qa/omics_qa_config_searched.yaml \ - --output_dir cache/ + --config_file examples/generate/generate_omics_qa/omics_qa_config_searched.yaml diff --git a/examples/search/search_dna/search_dna.sh b/examples/search/search_dna/search_dna.sh index 67c6beaa..ef51281d 100644 --- a/examples/search/search_dna/search_dna.sh +++ b/examples/search/search_dna/search_dna.sh @@ -1,4 +1,3 @@ python3 -m graphgen.run \ ---config_file examples/search/search_dna/search_dna_config.yaml \ ---output_dir cache/ +--config_file examples/search/search_dna/search_dna_config.yaml diff --git a/examples/search/search_protein/search_uniprot.sh b/examples/search/search_protein/search_uniprot.sh index 1032dfa0..627735a0 100644 --- a/examples/search/search_protein/search_uniprot.sh +++ b/examples/search/search_protein/search_uniprot.sh @@ -1,3 +1,2 @@ python3 -m graphgen.run \ ---config_file examples/search/search_protein/search_protein_config.yaml \ ---output_dir cache/ +--config_file examples/search/search_protein/search_protein_config.yaml diff --git a/examples/search/search_rna/search_rna.sh b/examples/search/search_rna/search_rna.sh index 9735f90d..04206c17 100644 --- a/examples/search/search_rna/search_rna.sh +++ b/examples/search/search_rna/search_rna.sh @@ -1,4 +1,3 @@ python3 -m graphgen.run \ ---config_file examples/search/search_rna/search_rna_config.yaml \ ---output_dir cache/ +--config_file examples/search/search_rna/search_rna_config.yaml From 6cc75e8b631630a0e0866e37f2d3afa12005b637 Mon Sep 17 00:00:00
2001 From: CHERRY-ui8 <2693275288@qq.com> Date: Wed, 24 Dec 2025 16:53:31 +0800 Subject: [PATCH 20/20] refactor: remove unused multi_omics_search.py --- .../operators/search/multi_omics_search.py | 29 ------------------- 1 file changed, 29 deletions(-) delete mode 100644 graphgen/operators/search/multi_omics_search.py diff --git a/graphgen/operators/search/multi_omics_search.py b/graphgen/operators/search/multi_omics_search.py deleted file mode 100644 index fbe10f06..00000000 --- a/graphgen/operators/search/multi_omics_search.py +++ /dev/null @@ -1,29 +0,0 @@ -import re -from typing import Dict, Optional - -from graphgen.models import UniProtSearch - - -def _fetch_uniprot(entry: str) -> Optional[Dict]: - entry = entry.strip() - client = UniProtSearch() - - # 1. first try accession search - if re.fullmatch( - r"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}", entry - ): - return client.get_by_accession(entry) - - # 2. then try keyword search - return client.get_best_hit(entry) - - -def multi_omics_search(entry: str) -> Dict: - """ - Multi-omics search function that tries to fetch protein/gene information. - """ - # TODO: Extend this function to include more omics databases as needed. - result = _fetch_uniprot(entry) - if result: - return {"input": entry, "uniprot": result} - return {"input": entry, "uniprot": None}