diff --git a/examples/generate/generate_omics_qa/README.md b/examples/generate/generate_omics_qa/README.md
new file mode 100644
index 00000000..9aca9561
--- /dev/null
+++ b/examples/generate/generate_omics_qa/README.md
@@ -0,0 +1,216 @@
+# Multi-omics Knowledge Graph QA Generation
+
+This example demonstrates how to build knowledge graphs from multi-omics data (DNA, RNA, protein) and generate question-answer pairs with the unified `omics_qa` method.
+
+## Pipeline Overview
+
+The pipeline includes the following steps:
+
+1. **read**: Read input files (JSON/JSONL format with sequence queries or protein data)
+2. **search**: Search biological databases (NCBI for DNA, RNAcentral for RNA, UniProt for protein) - *optional if the input already contains search results*
+3. **chunk**: Chunk sequences and metadata
+4. **build_kg**: Extract entities and relationships to build the knowledge graph
+5. **partition**: Partition the knowledge graph into communities using anchor-based BFS
+6. **generate**: Generate QA pairs from the partitioned communities, with automatic molecule caption extraction
+
+## Key Features
+
+- **Unified QA Generation**: A single `omics_qa` method supports DNA, RNA, and protein
+- **Automatic Caption Extraction**: Extracts and attaches molecule-specific information (dna/rna/protein captions) to each QA pair
+- **Flexible Configuration**: Switch between DNA, RNA, and protein by changing the input file and data source
+- **Anchor-based Partitioning**: Uses the molecule type (dna/rna/protein) as the anchor for BFS partitioning
+
+## Quick Start
+
+### 1. Configure Input Data
+
+Edit `omics_qa_config.yaml` to set the input file path:
+
+**For DNA:**
+```yaml
+input_path:
+  - examples/input_examples/search_dna_demo.jsonl
+```
+
+**For RNA:**
+```yaml
+input_path:
+  - examples/input_examples/search_rna_demo.jsonl
+```
+
+**For Protein:**
+```yaml
+input_path:
+  - examples/input_examples/search_protein_demo.jsonl
+```
+
+### 2. Configure Data Source
+
+Set the appropriate data source and parameters in the `search_data` node:
+
+**For DNA (NCBI):**
+```yaml
+data_sources: [ncbi]
+ncbi_params:
+  email: your_email@example.com  # Required!
+  tool: GraphGen
+  use_local_blast: true
+  local_blast_db: refseq_release/refseq_release
+  blast_num_threads: 2
+  max_concurrent: 5
+```
+
+**For RNA (RNAcentral):**
+```yaml
+data_sources: [rnacentral]
+rnacentral_params:
+  use_local_blast: true
+  local_blast_db: rnacentral_ensembl_gencode_YYYYMMDD/ensembl_gencode_YYYYMMDD
+  blast_num_threads: 2
+  max_concurrent: 5
+```
+
+**For Protein (UniProt):**
+```yaml
+data_sources: [uniprot]
+uniprot_params:
+  use_local_blast: true
+  local_blast_db: ${RELEASE}/uniprot_sprot
+  blast_num_threads: 2
+  max_concurrent: 5
+```
+
+### 3. Configure Anchor Type
+
+Set the `anchor_type` in the `partition` node to match your molecule type:
+
+```yaml
+partition:
+  params:
+    method: anchor_bfs
+    method_params:
+      anchor_type: protein  # Change to "dna" or "rna" as needed
+      max_units_per_community: 10
+```
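+
+If your input mixes molecule types, `anchor_type` also accepts a list; the bundled `omics_qa_config.yaml` uses this to process DNA, RNA, and protein together:
+
+```yaml
+partition:
+  params:
+    method: anchor_bfs
+    method_params:
+      anchor_type: [dna, rna, protein]  # list or single string
+      max_units_per_community: 10
+```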
+
+### 4. Run the Pipeline
+
+```bash
+./generate_omics_qa.sh
+```
+
+Or run directly with Python:
+
+```bash
+python3 -m graphgen.run \
+    --config_file examples/generate/generate_omics_qa/omics_qa_config.yaml \
+    --output_dir cache/
+```
+
+## Input Format
+
+### For DNA/RNA (JSONL format):
+```jsonl
+{"type": "text", "content": "BRCA1"}
+{"type": "text", "content": ">query\nATGCGATCG..."}
+{"type": "text", "content": "ATGCGATCG..."}
+```
+
+### For Protein (JSONL format):
+```jsonl
+{"type": "text", "content": "P01308"}
+{"type": "text", "content": "insulin"}
+{"type": "text", "content": "MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"}
+```
+
+## Output Format
+
+The `omics_qa` method automatically extracts and attaches molecule-specific captions to QA pairs:
+
+### Alpaca Format:
+```json
+{
+  "instruction": "What is the function of this protein?",
+  "input": "",
+  "output": "The protein functions as...",
+  "dna": {...},      # DNA caption (if molecule_type is DNA)
+  "rna": {...},      # RNA caption (if molecule_type is RNA)
+  "protein": {...}   # Protein caption (if molecule_type is protein)
+}
+```
+
+The `#` comments and `{...}` placeholders above are illustrative, not valid JSON; only the caption matching the record's molecule type is attached.
+
+### ChatML Format:
+```json
+{
+  "messages": [
+    {
+      "role": "user",
+      "content": [
+        {
+          "text": "What is the function of this protein?",
+          "dna": {...},
+          "rna": {...},
+          "protein": {...}
+        }
+      ]
+    },
+    {
+      "role": "assistant",
+      "content": "The protein functions as..."
+    }
+  ]
+}
+```
+
+## Caption Information
+
+The generator automatically extracts relevant caption information based on molecule type:
+
+- **DNA**: gene_name, gene_description, organism, chromosome, genomic_location, function, gene_type, etc.
+- **RNA**: rna_type, description, organism, related_genes, gene_name, so_term, modifications, etc.
+- **Protein**: protein_name, gene_names, organism, function, sequence, entry_name, etc.
+
+## Configuration Options
+
+### Chunking Parameters
+- `chunk_size`: Size for text metadata chunks (default: 1024)
+- `chunk_overlap`: Overlap for text chunks (default: 100)
+- `sequence_chunk_size`: Size for sequence chunks (default: 1000)
+- `sequence_chunk_overlap`: Overlap for sequence chunks (default: 100)
+
+### Partition Parameters
+- `method`: `anchor_bfs` (recommended for omics data)
+- `anchor_type`: `dna`, `rna`, or `protein` (must match your data type)
+- `max_units_per_community`: Maximum nodes and edges per community (default: 10)
+
+### Generation Parameters
+- `method`: `omics_qa` (unified method for DNA/RNA/Protein)
+- `data_format`: `Alpaca`, `ChatML`, or `Sharegpt`
+
+## Notes
+
+- **NCBI requires an email address** - Make sure to set `email` in `ncbi_params`
+- **Anchor type must match molecule type** - Set `anchor_type` to match your data (dna/rna/protein)
+- **Local BLAST** can be enabled if you have local databases set up (see `examples/search/build_db/`)
+- **Caption extraction** is automatic - The generator detects the molecule type and extracts the relevant caption information
+- Adjust `max_concurrent` based on your system resources and API rate limits
+
+## Examples
+
+### Generate QA for Protein Data
+1. Set `input_path` to `examples/input_examples/search_protein_demo.jsonl`
+2. Set `data_sources: [uniprot]`
+3. Set `anchor_type: protein`
+4. Run `./generate_omics_qa.sh`
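+
+After a run, you can sanity-check the generated pairs and their attached captions. The snippet below is a minimal sketch rather than part of the pipeline, assuming the default `data_format: ChatML` from the bundled configs; the output filename under `cache/` is a placeholder, so point it at whatever file your run actually writes.
+
+```python
+import json
+
+# Placeholder path -- adjust to the output file your run produces
+output_file = "cache/qa_pairs.jsonl"
+
+with open(output_file, encoding="utf-8") as f:
+    for line in f:
+        record = json.loads(line)
+        # ChatML format: the first message is the user turn, the second the assistant turn
+        user_content = record["messages"][0]["content"][0]
+        question = user_content.get("text", "")
+        # Only the caption matching the record's molecule type should be attached
+        captions = [key for key in ("dna", "rna", "protein") if key in user_content]
+        print(question[:60], "->", captions)
+```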
+
+### Generate QA for DNA Data
+1. Set `input_path` to `examples/input_examples/search_dna_demo.jsonl`
+2. Set `data_sources: [ncbi]`
+3. Set `anchor_type: dna`
+4. Run `./generate_omics_qa.sh`
+
+### Generate QA for RNA Data
+1. Set `input_path` to `examples/input_examples/search_rna_demo.jsonl`
+2. Set `data_sources: [rnacentral]`
+3. Set `anchor_type: rna`
+4. Run `./generate_omics_qa.sh`
diff --git a/examples/generate/generate_omics_qa/generate_omics_qa.sh b/examples/generate/generate_omics_qa/generate_omics_qa.sh
new file mode 100755
index 00000000..0f1420f4
--- /dev/null
+++ b/examples/generate/generate_omics_qa/generate_omics_qa.sh
@@ -0,0 +1,2 @@
+python3 -m graphgen.run \
+    --config_file examples/generate/generate_omics_qa/omics_qa_config.yaml
diff --git a/examples/generate/generate_omics_qa/generate_omics_qa_searched.sh b/examples/generate/generate_omics_qa/generate_omics_qa_searched.sh
new file mode 100755
index 00000000..ec178889
--- /dev/null
+++ b/examples/generate/generate_omics_qa/generate_omics_qa_searched.sh
@@ -0,0 +1,2 @@
+python3 -m graphgen.run \
+    --config_file examples/generate/generate_omics_qa/omics_qa_config_searched.yaml
diff --git a/examples/generate/generate_omics_qa/omics_qa_config.yaml b/examples/generate/generate_omics_qa/omics_qa_config.yaml
new file mode 100644
index 00000000..8f7966ad
--- /dev/null
+++ b/examples/generate/generate_omics_qa/omics_qa_config.yaml
@@ -0,0 +1,92 @@
+global_params:
+  working_dir: cache
+  graph_backend: kuzu  # graph database backend; supported: kuzu, networkx
+  kv_backend: rocksdb  # key-value store backend; supported: rocksdb, json_kv
+
+nodes:
+  - id: read_files
+    op_name: read
+    type: source
+    dependencies: []
+    params:
+      input_path:
+        # Three input files so DNA, RNA, and protein data are generated together
+        - examples/input_examples/search_dna_demo.jsonl
+        - examples/input_examples/search_rna_demo.jsonl
+        - examples/input_examples/search_protein_demo.jsonl
+
+  - id: search_data
+    op_name: search
+    type: map_batch
+    dependencies:
+      - read_files
+    execution_params:
+      replicas: 1
+      batch_size: 10
+    params:
+      data_sources: [ncbi, rnacentral, uniprot]  # Multi-omics: use all three data sources
+      # DNA search parameters
+      ncbi_params:
+        email: your_email@example.com  # Required for NCBI
+        tool: GraphGen
+        use_local_blast: true
+        local_blast_db: path_to_your_local_blast_db/refseq_version/refseq_version
+        blast_num_threads: 2
+        max_concurrent: 5
+      # RNA search parameters
+      rnacentral_params:
+        use_local_blast: true
+        local_blast_db: path_to_your_local_blast_db/rnacentral_YYYYMMDD/rnacentral_YYYYMMDD
+        blast_num_threads: 2
+        max_concurrent: 5
+      # Protein search parameters
+      uniprot_params:
+        use_local_blast: true
+        local_blast_db: path_to_your_local_blast_db/${RELEASE}/uniprot_sprot
+        blast_num_threads: 2
+        max_concurrent: 5
+
+  - id: chunk_documents
+    op_name: chunk
+    type: map_batch
+    dependencies:
+      - search_data
+    execution_params:
+      replicas: 4
+    params:
+      chunk_size: 1024  # chunk size for text splitting
+      chunk_overlap: 100  # chunk overlap for text splitting
+      sequence_chunk_size: 1000  # for sequence chunks (bp for DNA/RNA, aa for protein)
+      sequence_chunk_overlap: 100
+
+  - id: build_kg
+    op_name: build_kg
+    type: map_batch
+    dependencies:
+      - chunk_documents
+    execution_params:
+      replicas: 1
+      batch_size: 128
+
+  - id: partition
+    op_name: partition
+    type: aggregate
+    dependencies:
+      - build_kg
+    params:
+      method: anchor_bfs  # partition method
+      method_params:
+        anchor_type: [dna, rna, protein]  # Multi-omics: supports multiple anchor types (a list or a single string)
+        max_units_per_community: 10  # max nodes and edges per community
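+        # anchor_bfs starts a BFS from each anchor node of the types listed above and
+        # caps each resulting community at max_units_per_community nodes/edges;
+        # see README.md in this directory for details.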
+
+  - id: generate
+    op_name: generate
+    type: map_batch
+    dependencies:
+      - partition
+    execution_params:
+      replicas: 1
+      batch_size: 128
+    params:
+      method: omics_qa  # unified QA generation method for DNA/RNA/Protein
+      data_format: ChatML  # Alpaca, Sharegpt, ChatML
diff --git a/examples/generate/generate_omics_qa/omics_qa_config_searched.yaml b/examples/generate/generate_omics_qa/omics_qa_config_searched.yaml
new file mode 100644
index 00000000..cf01bc65
--- /dev/null
+++ b/examples/generate/generate_omics_qa/omics_qa_config_searched.yaml
@@ -0,0 +1,73 @@
+global_params:
+  working_dir: cache
+  graph_backend: kuzu  # graph database backend; supported: kuzu, networkx
+  kv_backend: rocksdb  # key-value store backend; supported: rocksdb, json_kv
+
+nodes:
+  - id: read_files
+    op_name: read
+    type: source
+    dependencies: []
+    params:
+      input_path:
+        # Use pre-searched data files (skip the search step).
+        # The search service automatically detects and skips search if the data already contains search results.
+        - examples/input_examples/searched_dna_demo.jsonl
+        - examples/input_examples/searched_rna_demo.jsonl
+        - examples/input_examples/searched_protein_demo.jsonl
+
+  - id: search_data
+    op_name: search
+    type: map_batch
+    dependencies:
+      - read_files
+    execution_params:
+      replicas: 1
+      batch_size: 10
+    # Note: the search service automatically detects pre-searched data and skips the search,
+    # but it still normalizes the data format (ensuring the _doc_id, content, and data_source fields exist)
+
+  - id: chunk_documents
+    op_name: chunk
+    type: map_batch
+    dependencies:
+      - search_data
+    execution_params:
+      replicas: 4
+    params:
+      chunk_size: 1024  # chunk size for text splitting
+      chunk_overlap: 100  # chunk overlap for text splitting
+      sequence_chunk_size: 1000  # for sequence chunks (bp for DNA/RNA, aa for protein)
+      sequence_chunk_overlap: 100
+
+  - id: build_kg
+    op_name: build_kg
+    type: map_batch
+    dependencies:
+      - chunk_documents
+    execution_params:
+      replicas: 1
+      batch_size: 128
+
+  - id: partition
+    op_name: partition
+    type: aggregate
+    dependencies:
+      - build_kg
+    params:
+      method: anchor_bfs  # partition method
+      method_params:
+        anchor_type: [dna, rna, protein]  # Multi-omics: supports multiple anchor types (a list or a single string)
+        max_units_per_community: 10  # max nodes and edges per community
+
+  - id: generate
+    op_name: generate
+    type: map_batch
+    dependencies:
+      - partition
+    execution_params:
+      replicas: 1
+      batch_size: 128
+    params:
+      method: omics_qa  # unified QA generation method for DNA/RNA/Protein
+      data_format: ChatML  # Alpaca, Sharegpt, ChatML
diff --git a/examples/input_examples/searched_dna_demo.jsonl b/examples/input_examples/searched_dna_demo.jsonl
new file mode 100644
index 00000000..05778743
--- /dev/null
+++ b/examples/input_examples/searched_dna_demo.jsonl
@@ -0,0 +1,3 @@
+{"_doc_id":"doc-NG_011079","type":"dna","content":"Title: Homo sapiens ribosomal protein L35a pseudogene 6 (RPL35AP6) on chromosome 1\nSequence: 
ACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG","data_source":"ncbi","molecule_type":"DNA","database":"NCBI","id":"NG_011079","gene_name":"RPL35AP6","gene_description":"ribosomal protein L35a pseudogene 6","organism":"Homo sapiens","url":"https:\/\/www.ncbi.nlm.nih.gov\/nuccore\/NG_011079","gene_synonyms":["RPL35A_3_191"],"gene_type":"other","chromosome":"1","genomic_location":"1-522","function":null,"title":"Homo sapiens ribosomal protein L35a pseudogene 6 (RPL35AP6) on chromosome 1","sequence":"ACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG","sequence_length":522,"gene_id":"100271312","molecule_type_detail":"genomic region","_search_query":"ACTCAATTGTCCCAGCAGCATCTACCGAAAAGCCCCCTTGCTGTTCCTGCCAACTTGAAGCCCGGAGGCCTGCTGGGAGGAGGAATTCTAAATGACAAGTATGCCTGGAAAGCTGTGGTCCAAGGCCGTTTTTGCCGTCAGCAGGATCTCCAGAACCAAAGGGAGGACACAGCTCTTCTTAAAACTGAAGGTATTTATGGCTGACATAAAATGAGATTTGATTTGGGCAGGAAATGCGCTTATGTGTACAAAGAATAATACTGACTCCTGGCAGCAAACCAAACAAAACCAGAGTAAGGTGGAGAAAGGTAACGTGTGCCCACGGAAACAGTGGCACAATGTGTGCCTAATTCCAAAGCAGCCGTCCTGCTTAGGCCACTAGTCACGGCGGCTCTGTGATGCTGTACTCCTCAAGGATTTGAACTAATGAAAAGTAAATAAATACCAGTAAAAGTGGATTTGTAAAAAGAAAAGAAAAATGATAGGAAAAGCCCCTTTACCATATGTCAAGGGTTTATGCTG"} +{"_doc_id":"doc-NG_033923","type":"dna","content":"Title: Callithrix jacchus immunity-related GTPase family, M, pseudogene (IRGMP) on chromosome 2\nSequence: GAACTCCTGACCTCAGGTGATCCACCTGCTTTGGCCTCCCAAAATGCCAGGATTACAGGTATGAGCCACCACGCCCAGCCAGCATTGGGGTATATCGAAGGCAGAGGTCATGAATGTTGAGAGAGCCTCAGCAGATGGGGACTTGCCAGAGGTGGTCTCTGCCATCAAGGAGAGTTTGAAGATAGTGTTCAGGACACCAGTCAACATCGCTATGGCAGGGGACTCTGGCAATAGCATATCCACCTTCATCAGTGCACTTCAAATCGCAGGGCATGAGGCGAAGGCCTCACCTCCTACTGGGCTGGTAAAAGCTACCCAAAGATGTGCCTCCTATTTCTCTTCCCGCTTTCCAAATGTGGTGCTGTGGGATCTGCCTGGAGCAGGGTCTGCCACCAAAACTCTGGAGAACTACCTGATGGAAATGTAGTTCAACCAATATGACTTCATCATGGTTGCATCTGCACAATTCAGCATGAATCATGTGATCCTTGCCAAAACCATTGAGGACATGGGAAAGAAGTTCTACATTGTCTGGACCAAGCTGGACATGGATCTCAGCACAGGTGCCCTCCCAGAAGTGCAGCTACTGTAAATCAGAGAAAATGTCCTGGAAAGTCTCCAGAGGGAGCAGGTATGTGAACTCCCCATATTTATGGCCTCCAGCCTTGAACCTTTATTGCATGACTTCCCAAAGCTTAGAGACACATTGCAAAAGACTCATCCAAATTAGGTGCCATGGCCCTCTTCAAAACCTGTCCCACACCTGTGAGATGATCACGAATGACAAAGCAATCTCCCTGCAGAAGAAAACAACCATACAGTCTTTCCAG","data_source":"ncbi","molecule_type":"DNA","database":"NCBI","id":"NG_033923","gene_name":"IRGMP","gene_description":"immunity-related GTPase family, M, pseudogene","organism":"Callithrix jacchus","url":"https:\/\/www.ncbi.nlm.nih.gov\/nuccore\/NG_033923","gene_synonyms":null,"gene_type":"other","chromosome":"2","genomic_location":"1-830","function":null,"title":"Callithrix jacchus immunity-related GTPase family, M, pseudogene 
(IRGMP) on chromosome 2","sequence":"GAACTCCTGACCTCAGGTGATCCACCTGCTTTGGCCTCCCAAAATGCCAGGATTACAGGTATGAGCCACCACGCCCAGCCAGCATTGGGGTATATCGAAGGCAGAGGTCATGAATGTTGAGAGAGCCTCAGCAGATGGGGACTTGCCAGAGGTGGTCTCTGCCATCAAGGAGAGTTTGAAGATAGTGTTCAGGACACCAGTCAACATCGCTATGGCAGGGGACTCTGGCAATAGCATATCCACCTTCATCAGTGCACTTCAAATCGCAGGGCATGAGGCGAAGGCCTCACCTCCTACTGGGCTGGTAAAAGCTACCCAAAGATGTGCCTCCTATTTCTCTTCCCGCTTTCCAAATGTGGTGCTGTGGGATCTGCCTGGAGCAGGGTCTGCCACCAAAACTCTGGAGAACTACCTGATGGAAATGTAGTTCAACCAATATGACTTCATCATGGTTGCATCTGCACAATTCAGCATGAATCATGTGATCCTTGCCAAAACCATTGAGGACATGGGAAAGAAGTTCTACATTGTCTGGACCAAGCTGGACATGGATCTCAGCACAGGTGCCCTCCCAGAAGTGCAGCTACTGTAAATCAGAGAAAATGTCCTGGAAAGTCTCCAGAGGGAGCAGGTATGTGAACTCCCCATATTTATGGCCTCCAGCCTTGAACCTTTATTGCATGACTTCCCAAAGCTTAGAGACACATTGCAAAAGACTCATCCAAATTAGGTGCCATGGCCCTCTTCAAAACCTGTCCCACACCTGTGAGATGATCACGAATGACAAAGCAATCTCCCTGCAGAAGAAAACAACCATACAGTCTTTCCAG","sequence_length":830,"gene_id":"100409682","molecule_type_detail":"genomic region","_search_query":"NG_033923"} +{"_doc_id":"doc-NG_056118","type":"dna","content":"Title: Homo sapiens major histocompatibility complex, class II, DR beta 8 (pseudogene) (HLA-DRB8) on chromosome 6\nSequence: GCCAGAGCCTAGGTTTACAGAGAAGCAGACAAACAAAACAGCCAAACAAGGAGACTTACTCTGTCTTCATGACTCATTCCCTCTACATTTTTTCTTCTAGTCCATCCTAAGGTGACTGTGTATCCTTTAAAGACCCAGCCCCTGCAGCACCACAACCTCCTGGTCTGCTCTGTGAGTGGTTTCTGTCCAGCCAGCATTGAAGTCAGGTGGTTCCGGAACGGCCAGGAAGAGAAGGCTGGGGTGGTGTCCACAGGCCTGATCCAGAATGGAGACTGGACCTTCCAGACACTGATGATGCTGGAAACAGTTCCTCAGAGTGGAGAGGTTTACACCTGCCAAGTGGAGCATCCAAGCATGATGAGCCCTCTCACGGTGCAATGGAGTTAGCAGCTTTCTGACTTCATAAATTTTTCACCCAGTAAGTACAGGACTGTGCTAATCCCTGAGTGTCAGGTTTCTCCTCTCCCACATCCTATTTTCATTTGCTCCATATTCTCATCTCCATCAGCACAGGTCACTGGGGATAGCCCTGTAATCATTTCTAAAAGCACCTGTACCCCATGGTAAAGCAGTCATGCCTGCCAGGCGGGAGAGGCTGTCTCTCTTTTGAACCTCCCCATGATGGCACAGGTCAGGGTCACCCACTCTCCCTGGCTCCAGGCCCTGCCTCTGGGTCTGAGATTGTATTTCTGCTGCTGTTGCTCTGGGTTGTTTGTTGTGATCTGAGAAGAGGAGAACTGTAGGGGTCTTCCTGGCATGAGGGGAGTCCAATCCCAGCTCTGCCTTTTATTAGCTCTGTCACTCTAGACAAACTACTAAACCTCTTTGAGTCTCAGGATTTCTGTGGATCAGATGTCAAAGTCATGCCTTACATCAAGGCTGTAATATTTGAATGAGTTTGAGGCCTAACCTTGTAACTGTTCAGTGTGATCTGAAAACCTTTTTTCCCCAGAAATAGCTAGTTATTTTAGTTCTTGCAGGGCAGCCTTCTTCCCCATTTTCAAAGCTCTGAATCTCAGTATCTCAATTACAGAGGTTCAATTTGGGATAAAAATCACTAAACCTGGCTTCCACTCTCAGGAGCATGGTCTGAATCTGCACAGAGCAAGATGCTGAGTGGAGTCGGGGGCTTTGTGCTGGGCCTGCTCTTCCTTGGGGCCGGGCTGTTTCTCTACTTCAGGAATCAGAAAGGTGAGGAACCTTTCGTAGCTGGCTCTCTCCATAGACTTTTCTGGAGGAGGAAATATGGCTTTGCAGAGGTTAGTTCTCAGTATATGAGTGGCCCTGGATAAAGCCTTTCTTTCCCAAAACGACCTCCAATGTCCCGCTAATCCAGAAATCATCAGTGCATGGTTACTATGTCAAAGCATAATAGCTTATGGCCTGCAGAGAGAAAAGAAAGGCTAACAAGTAGGGATCCTTTGGTTGGAGATCCTGGAGCAAATTAAGGAAGAGCCACTAAGGTTAATACAATTACACTGGATCCTATGACAGACACTTCACGCTTCAGGGGTCACGTGGTGAGTTTCTGCTCCTCTCTGCCCTGGTTCATGTAAGTTGTGGTGTTAGAGAAATCTCAGGTGGGAGATCTGGGGCTGGGATATTGTGTTGGAGGACAGATTTGCTTCCATATCTTTTTTCTTTTTTCTTTTTTTTGAGACGGAGTCTCGCTCTGTCCCCAGGCTGGAGTGCAGTGGCGTGATCTTGGCTCACTGCAACCTCCTTCTCCCGGATTCAAGTGATTCTCCTGCCTCAACCTCCCGAGTAGCTGGGACTATAGGCACCTGCCACCACGCCCAGCTAATTTTTGTATTTTTAGTAGAGATGGGGTTTCACCATGTTGGCCAAGATGGTCTCGATCTCTTGACCTTGTGATCCACCCAACTTGGCCTCCCAAAGTGCTGGGATTACAGGCATGAGCCACCGCACCCGGCCTGCTTCCATATCTTTTAAATGTGTATCTTTTCCCCTTTTTCCCAGGACACTCTGGACTTCAGCCAACAGGTAATACCTTTTCATTCTCTTTTAGAAACAGATTCGCTTTCCTAGAATGATGGTAGAGGTGATAAGGGATGAGACAGAAATAATAGGAAAGACTTTGGATCCAAATTTCTGATCAGGCAATTTACGCCAAAACTCCTCTCTACTTAGAAAAGGCCTGTGCTTGGCCAGGCGCAGTAGCTCATGCCTGTAATCTCAGCACTTTGGGAGGCTGAGGCGGGTGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGACCAACAAGGAGAAACCTTGTCTCTACTAAAAATACAAAAAAAATTAGCCATGCGTGGTGGCGCATGCCTGTAATTCCAGCTACTGAGGAGGCTGAGGTAGGAGAATGGTTTGAAGCTGGGAGGCAGAGGTTGTGGTA
AGCGCACCACTGCACTCCAGCCTGGGCAACAAGAGTGAAACTCCATCTGAAAAAATGAATAAATAAAAAATAAAAGGCCAGTGCTCTGCAGTAGTATTGGCTCAGGGAGACTTAGCAACTTGTTTTTCTTCTTCCTGTACTGCTTTCATCTGAGTCCCTGAAAGAGGGGGAAAGAAGCTGTTAGTAGAGCCATGTCTGAAAACAACACTCTCCTGTGTCTTCTGCAGGACTCCTGAACTGAAGTGAAGATGACCACATTCAAGGAGGAAACTTCTGCCCCAGCTTTGCAGGAGGAAAAGCTTTTCCGCTTGGCTCTTTTTTTTTTTTTTAGTTTTATTTAT","data_source":"ncbi","molecule_type":"DNA","database":"NCBI","id":"NG_056118","gene_name":"HLA-DRB8","gene_description":"major histocompatibility complex, class II, DR beta 8 (pseudogene)","organism":"Homo sapiens","url":"https:\/\/www.ncbi.nlm.nih.gov\/nuccore\/NG_056118","gene_synonyms":null,"gene_type":"other","chromosome":"6","genomic_location":"1-2737","function":null,"title":"Homo sapiens major histocompatibility complex, class II, DR beta 8 (pseudogene) (HLA-DRB8) on chromosome 6","sequence":"GCCAGAGCCTAGGTTTACAGAGAAGCAGACAAACAAAACAGCCAAACAAGGAGACTTACTCTGTCTTCATGACTCATTCCCTCTACATTTTTTCTTCTAGTCCATCCTAAGGTGACTGTGTATCCTTTAAAGACCCAGCCCCTGCAGCACCACAACCTCCTGGTCTGCTCTGTGAGTGGTTTCTGTCCAGCCAGCATTGAAGTCAGGTGGTTCCGGAACGGCCAGGAAGAGAAGGCTGGGGTGGTGTCCACAGGCCTGATCCAGAATGGAGACTGGACCTTCCAGACACTGATGATGCTGGAAACAGTTCCTCAGAGTGGAGAGGTTTACACCTGCCAAGTGGAGCATCCAAGCATGATGAGCCCTCTCACGGTGCAATGGAGTTAGCAGCTTTCTGACTTCATAAATTTTTCACCCAGTAAGTACAGGACTGTGCTAATCCCTGAGTGTCAGGTTTCTCCTCTCCCACATCCTATTTTCATTTGCTCCATATTCTCATCTCCATCAGCACAGGTCACTGGGGATAGCCCTGTAATCATTTCTAAAAGCACCTGTACCCCATGGTAAAGCAGTCATGCCTGCCAGGCGGGAGAGGCTGTCTCTCTTTTGAACCTCCCCATGATGGCACAGGTCAGGGTCACCCACTCTCCCTGGCTCCAGGCCCTGCCTCTGGGTCTGAGATTGTATTTCTGCTGCTGTTGCTCTGGGTTGTTTGTTGTGATCTGAGAAGAGGAGAACTGTAGGGGTCTTCCTGGCATGAGGGGAGTCCAATCCCAGCTCTGCCTTTTATTAGCTCTGTCACTCTAGACAAACTACTAAACCTCTTTGAGTCTCAGGATTTCTGTGGATCAGATGTCAAAGTCATGCCTTACATCAAGGCTGTAATATTTGAATGAGTTTGAGGCCTAACCTTGTAACTGTTCAGTGTGATCTGAAAACCTTTTTTCCCCAGAAATAGCTAGTTATTTTAGTTCTTGCAGGGCAGCCTTCTTCCCCATTTTCAAAGCTCTGAATCTCAGTATCTCAATTACAGAGGTTCAATTTGGGATAAAAATCACTAAACCTGGCTTCCACTCTCAGGAGCATGGTCTGAATCTGCACAGAGCAAGATGCTGAGTGGAGTCGGGGGCTTTGTGCTGGGCCTGCTCTTCCTTGGGGCCGGGCTGTTTCTCTACTTCAGGAATCAGAAAGGTGAGGAACCTTTCGTAGCTGGCTCTCTCCATAGACTTTTCTGGAGGAGGAAATATGGCTTTGCAGAGGTTAGTTCTCAGTATATGAGTGGCCCTGGATAAAGCCTTTCTTTCCCAAAACGACCTCCAATGTCCCGCTAATCCAGAAATCATCAGTGCATGGTTACTATGTCAAAGCATAATAGCTTATGGCCTGCAGAGAGAAAAGAAAGGCTAACAAGTAGGGATCCTTTGGTTGGAGATCCTGGAGCAAATTAAGGAAGAGCCACTAAGGTTAATACAATTACACTGGATCCTATGACAGACACTTCACGCTTCAGGGGTCACGTGGTGAGTTTCTGCTCCTCTCTGCCCTGGTTCATGTAAGTTGTGGTGTTAGAGAAATCTCAGGTGGGAGATCTGGGGCTGGGATATTGTGTTGGAGGACAGATTTGCTTCCATATCTTTTTTCTTTTTTCTTTTTTTTGAGACGGAGTCTCGCTCTGTCCCCAGGCTGGAGTGCAGTGGCGTGATCTTGGCTCACTGCAACCTCCTTCTCCCGGATTCAAGTGATTCTCCTGCCTCAACCTCCCGAGTAGCTGGGACTATAGGCACCTGCCACCACGCCCAGCTAATTTTTGTATTTTTAGTAGAGATGGGGTTTCACCATGTTGGCCAAGATGGTCTCGATCTCTTGACCTTGTGATCCACCCAACTTGGCCTCCCAAAGTGCTGGGATTACAGGCATGAGCCACCGCACCCGGCCTGCTTCCATATCTTTTAAATGTGTATCTTTTCCCCTTTTTCCCAGGACACTCTGGACTTCAGCCAACAGGTAATACCTTTTCATTCTCTTTTAGAAACAGATTCGCTTTCCTAGAATGATGGTAGAGGTGATAAGGGATGAGACAGAAATAATAGGAAAGACTTTGGATCCAAATTTCTGATCAGGCAATTTACGCCAAAACTCCTCTCTACTTAGAAAAGGCCTGTGCTTGGCCAGGCGCAGTAGCTCATGCCTGTAATCTCAGCACTTTGGGAGGCTGAGGCGGGTGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGACCAACAAGGAGAAACCTTGTCTCTACTAAAAATACAAAAAAAATTAGCCATGCGTGGTGGCGCATGCCTGTAATTCCAGCTACTGAGGAGGCTGAGGTAGGAGAATGGTTTGAAGCTGGGAGGCAGAGGTTGTGGTAAGCGCACCACTGCACTCCAGCCTGGGCAACAAGAGTGAAACTCCATCTGAAAAAATGAATAAATAAAAAATAAAAGGCCAGTGCTCTGCAGTAGTATTGGCTCAGGGAGACTTAGCAACTTGTTTTTCTTCTTCCTGTACTGCTTTCATCTGAGTCCCTGAAAGAGGGGGAAAGAAGCTGTTAGTAGAGCCATGTCTGAAAACAACACTCTCCTGTGTCTTCTGCAGGACTCCTGAACTGAAGTGAAGATGACCACATTCAAGGAGGAAACTTCTGCCCCAGCTTTGCAGGAGGAAAAGCTTTTCCGCTTG
GCTCTTTTTTTTTTTTTTAGTTTTATTTAT","sequence_length":2737,"gene_id":"3130","molecule_type_detail":"genomic region","_search_query":"NG_056118"} diff --git a/examples/input_examples/searched_protein_demo.jsonl b/examples/input_examples/searched_protein_demo.jsonl new file mode 100644 index 00000000..47ab02ad --- /dev/null +++ b/examples/input_examples/searched_protein_demo.jsonl @@ -0,0 +1,8 @@ +{"_doc_id":"doc-P01308","type":"protein","content":"Function: ['Insulin decreases blood glucose concentration. It increases cell permeability to monosaccharides, amino acids and fatty acids. It accelerates glycolysis, the pentose phosphate cycle, and glycogen synthesis in liver.']\nSequence: MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"P01308","entry_name":"INS_HUMAN","gene_names":[{"Name":"INS"}],"protein_name":"Insulin","organism":"Homo sapiens","sequence":"MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN","function":["Insulin decreases blood glucose concentration. It increases cell permeability to monosaccharides, amino acids and fatty acids. It accelerates glycolysis, the pentose phosphate cycle, and glycogen synthesis in liver."],"url":"https:\/\/www.uniprot.org\/uniprot\/P01308","_search_query":"P01308"} +{"_doc_id":"doc-Q6UWZ7","type":"protein","content":"Function: [\"Involved in DNA damage response and double-strand break (DSB) repair. Component of the BRCA1-A complex, acting as a central scaffold protein that assembles the various components of the complex and mediates the recruitment of BRCA1. The BRCA1-A complex specifically recognizes 'Lys-63'-linked ubiquitinated histones H2A and H2AX at DNA lesion sites, leading to target the BRCA1-BARD1 heterodimer to sites of DNA damage at DSBs. This complex also possesses deubiquitinase activity that specifically removes 'Lys-63'-linked ubiquitin on histones H2A and H2AX. 
{ECO:0000269|PubMed:17525340, ECO:0000269|PubMed:17643121, ECO:0000269|PubMed:17643122, ECO:0000269|PubMed:18077395, ECO:0000269|PubMed:19261748, ECO:0000269|PubMed:22357538, ECO:0000269|PubMed:26778126}.\"]\nSequence: MEGESTSAVLSGFVLGALAFQHLNTDSDTEGFLLGEVKGEAKNSITDSQMDDVEVVYTIDIQKYIPCYQLFSFYNSSGEVNEQALKKILSNVKKNVVGWYKFRRHSDQIMTFRERLLHKNLQEHFSNQDLVFLLLTPSIITESCSTHRLEHSLYKPQKGLFHRVPLVVANLGMSEQLGYKTVSGSCMSTGFSRAVQTHSSKFFEEDGSLKEVHKINEMYASLQEELKSICKKVEDSEQAVDKLVKDVNRLKREIEKRRGAQIQAAREKNIQKDPQENIFLCQALRTFFPNSEFLHSCVMSLKNRHVSKSSCNYNHHLDVVDNLTLMVEHTDIPEASPASTPQIIKHKALDLDDRWQFKRSRLLDTQDKRSKADTGSSNQDKASKMSSPETDEEIEKMKGFGEYSRSPTF","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"Q6UWZ7","entry_name":"ABRX1_HUMAN","gene_names":[{"Name":"ABRAXAS1 {ECO:0000312|HGNC:HGNC:25829}","Synonyms":["ABRA1 {ECO:0000312|HGNC:HGNC:25829}","CCDC98","FAM175A {ECO:0000312|HGNC:HGNC:25829}"],"ORFNames":["UNQ496\/PRO1013"]}],"protein_name":"BRCA1-A complex subunit Abraxas 1 {ECO:0000312|HGNC:HGNC:25829}","organism":"Homo sapiens","sequence":"MEGESTSAVLSGFVLGALAFQHLNTDSDTEGFLLGEVKGEAKNSITDSQMDDVEVVYTIDIQKYIPCYQLFSFYNSSGEVNEQALKKILSNVKKNVVGWYKFRRHSDQIMTFRERLLHKNLQEHFSNQDLVFLLLTPSIITESCSTHRLEHSLYKPQKGLFHRVPLVVANLGMSEQLGYKTVSGSCMSTGFSRAVQTHSSKFFEEDGSLKEVHKINEMYASLQEELKSICKKVEDSEQAVDKLVKDVNRLKREIEKRRGAQIQAAREKNIQKDPQENIFLCQALRTFFPNSEFLHSCVMSLKNRHVSKSSCNYNHHLDVVDNLTLMVEHTDIPEASPASTPQIIKHKALDLDDRWQFKRSRLLDTQDKRSKADTGSSNQDKASKMSSPETDEEIEKMKGFGEYSRSPTF","function":["Involved in DNA damage response and double-strand break (DSB) repair. Component of the BRCA1-A complex, acting as a central scaffold protein that assembles the various components of the complex and mediates the recruitment of BRCA1. The BRCA1-A complex specifically recognizes 'Lys-63'-linked ubiquitinated histones H2A and H2AX at DNA lesion sites, leading to target the BRCA1-BARD1 heterodimer to sites of DNA damage at DSBs. This complex also possesses deubiquitinase activity that specifically removes 'Lys-63'-linked ubiquitin on histones H2A and H2AX. {ECO:0000269|PubMed:17525340, ECO:0000269|PubMed:17643121, ECO:0000269|PubMed:17643122, ECO:0000269|PubMed:18077395, ECO:0000269|PubMed:19261748, ECO:0000269|PubMed:22357538, ECO:0000269|PubMed:26778126}."],"url":"https:\/\/www.uniprot.org\/uniprot\/Q6UWZ7","_search_query":"BRCA1"} +{"_doc_id":"doc-P27355","type":"protein","content":"Function: ['Responsible for the initial oxygenation of methane to methanol in methanotrophs. It also catalyzes the monohydroxylation of a variety of unactivated alkenes, alicyclic, aromatic and heterocyclic compounds.']\nSequence: MAKREPIHDNSIRTEWEAKIAKLTSVDQATKFIQDFRLAYTSPFRKSYDIDVDYQYIERKIEEKLSVLKTEKLPVADLITKATTGEDRAAVEATWIAKIKAAKSKYEADGIHIEFRQLYKPPVLPVNVFLRTDAALGTVLMEIRNTDYYGTPLEGLRKEPGVKVLHLQA","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"P27355","entry_name":"MEMG_METTR","gene_names":[{"Name":"mmoZ"}],"protein_name":"Methane monooxygenase component A gamma chain","organism":"Methylosinus trichosporium.","sequence":"MAKREPIHDNSIRTEWEAKIAKLTSVDQATKFIQDFRLAYTSPFRKSYDIDVDYQYIERKIEEKLSVLKTEKLPVADLITKATTGEDRAAVEATWIAKIKAAKSKYEADGIHIEFRQLYKPPVLPVNVFLRTDAALGTVLMEIRNTDYYGTPLEGLRKEPGVKVLHLQA","function":["Responsible for the initial oxygenation of methane to methanol in methanotrophs. 
It also catalyzes the monohydroxylation of a variety of unactivated alkenes, alicyclic, aromatic and heterocyclic compounds."],"url":"https:\/\/www.uniprot.org\/uniprot\/P27355","_search_query":"MAKREPIHDNSIRTEWEAKIAKLTSVDQATKFIQDFRLAYTSPFRKSYDIDVDYQYIERKIEEKLSVLKTEKLPVADLITKATTGEDAAAVEATWIAKIKAAKSKYEAEAIHIEFRQLYKPPVLPVNVFLRTDAALGTVLMEIRNTDYYGTPLEGLRKERGVKVLHLQA"} +{"_doc_id":"doc-Q96GG9","type":"protein","content":"Function: ['Part of an E3 ubiquitin ligase complex for neddylation (PubMed:18826954). Promotes neddylation of cullin components of E3 cullin-RING ubiquitin ligase complexes (PubMed:19617556, PubMed:23201271, PubMed:23401859, PubMed:26906416). Acts by binding to cullin-RBX1 complexes in the cytoplasm and promoting their nuclear translocation, enhancing recruitment of E2-NEDD8 (UBE2M-NEDD8) thioester to the complex, and optimizing the orientation of proteins in the complex to allow efficient transfer of NEDD8 from the E2 to the cullin substrates. Involved in the release of inhibitory effets of CAND1 on cullin-RING ligase E3 complex assembly and activity (PubMed:25349211, PubMed:28581483). Also acts as an oncogene facilitating malignant transformation and carcinogenic progression (By similarity). {ECO:0000250|UniProtKB:Q9QZ73, ECO:0000269|PubMed:18826954, ECO:0000269|PubMed:19617556, ECO:0000269|PubMed:23201271, ECO:0000269|PubMed:23401859, ECO:0000269|PubMed:25349211, ECO:0000269|PubMed:26906416, ECO:0000269|PubMed:28581483}.']\nSequence: MNKLKSSQKDKVRQFMIFTQSSEKTAVSCLSQNDWKLDVATDNFFQNPELYIRESVKGSLDRKKLEQLYNRYKDPQDENKIGIDGIQQFCDDLALDPASISVLIIAWKFRAATQCEFSKQEFMDGMTELGCDSIEKLKAQIPKMEQELKEPGRFKDFYQFTFNFAKNPGQKGLDLEMAIAYWNLVLNGRFKFLDLWNKFLLEHHKRSIPKDTWNLLLDFSTMIADDMSNYDEEGAWPVLIDDFVEFARPQIAGTKSTTV","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"Q96GG9","entry_name":"DCNL1_HUMAN","gene_names":[{"Name":"DCUN1D1 {ECO:0000312|HGNC:HGNC:18184}","Synonyms":["DCN1 {ECO:0000303|PubMed:28581483}","DCUN1L1","RP42","SCCRO"]}],"protein_name":"DCN1-like protein 1 {ECO:0000305}","organism":"Homo sapiens","sequence":"MNKLKSSQKDKVRQFMIFTQSSEKTAVSCLSQNDWKLDVATDNFFQNPELYIRESVKGSLDRKKLEQLYNRYKDPQDENKIGIDGIQQFCDDLALDPASISVLIIAWKFRAATQCEFSKQEFMDGMTELGCDSIEKLKAQIPKMEQELKEPGRFKDFYQFTFNFAKNPGQKGLDLEMAIAYWNLVLNGRFKFLDLWNKFLLEHHKRSIPKDTWNLLLDFSTMIADDMSNYDEEGAWPVLIDDFVEFARPQIAGTKSTTV","function":["Part of an E3 ubiquitin ligase complex for neddylation (PubMed:18826954). Promotes neddylation of cullin components of E3 cullin-RING ubiquitin ligase complexes (PubMed:19617556, PubMed:23201271, PubMed:23401859, PubMed:26906416). Acts by binding to cullin-RBX1 complexes in the cytoplasm and promoting their nuclear translocation, enhancing recruitment of E2-NEDD8 (UBE2M-NEDD8) thioester to the complex, and optimizing the orientation of proteins in the complex to allow efficient transfer of NEDD8 from the E2 to the cullin substrates. Involved in the release of inhibitory effets of CAND1 on cullin-RING ligase E3 complex assembly and activity (PubMed:25349211, PubMed:28581483). Also acts as an oncogene facilitating malignant transformation and carcinogenic progression (By similarity). 
{ECO:0000250|UniProtKB:Q9QZ73, ECO:0000269|PubMed:18826954, ECO:0000269|PubMed:19617556, ECO:0000269|PubMed:23201271, ECO:0000269|PubMed:23401859, ECO:0000269|PubMed:25349211, ECO:0000269|PubMed:26906416, ECO:0000269|PubMed:28581483}."],"url":"https:\/\/www.uniprot.org\/uniprot\/Q96GG9","_search_query":"MGSSHHHHHHSQDLENLYFQGSMNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRTKRVITTFRTGTWDAYKNLRKKLEQLYNRYKDPQDENKIGIDGIQQFCDDLALDPASISVLIIAWKFRAATQCEFSKQEFMDGMTELGCDSIEKLKAQIPKMEQELKEPGRFKDFYQFTFNFAKNPGQKGLDLEMAIAYWNLVLNGRFKFLDLWNKFLLEHHKRSIPKDTWNLLLDFSTMIADDMSNYDEEGAWPVLIDDFVEFARPQIAGTKSTTV"} +{"_doc_id":"doc-P68871","type":"protein","content":"Function: ['Involved in oxygen transport from the lung to the various peripheral tissues. {ECO:0000269|PubMed:28066926}.', 'LVV-hemorphin-7 potentiates the activity of bradykinin, causing a decrease in blood pressure.', '[Spinorphin]: Functions as an endogenous inhibitor of enkephalin-degrading enzymes such as DPP3, and as a selective antagonist of the P2RX3 receptor which is involved in pain signaling, these properties implicate it as a regulator of pain and inflammation.']\nSequence: MVHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"P68871","entry_name":"HBB_HUMAN","gene_names":[{"Name":"HBB"}],"protein_name":"Hemoglobin subunit beta","organism":"Homo sapiens","sequence":"MVHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH","function":["Involved in oxygen transport from the lung to the various peripheral tissues. 
{ECO:0000269|PubMed:28066926}.","LVV-hemorphin-7 potentiates the activity of bradykinin, causing a decrease in blood pressure.","[Spinorphin]: Functions as an endogenous inhibitor of enkephalin-degrading enzymes such as DPP3, and as a selective antagonist of the P2RX3 receptor which is involved in pain signaling, these properties implicate it as a regulator of pain and inflammation."],"url":"https:\/\/www.uniprot.org\/uniprot\/P68871","_search_query":"P68871"} +{"_doc_id":"doc-P22939","type":"protein","content":"Sequence: MDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDADMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQSLKQLAEQSLDTSALEALADYIIQRNK","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"P22939","entry_name":"ISPA_ECOLI","gene_names":[{"Name":"ispA","OrderedLocusNames":["b0421","JW0411"]}],"protein_name":"Farnesyl diphosphate synthase","organism":"Escherichia coli","sequence":"MDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDADMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQSLKQLAEQSLDTSALEALADYIIQRNK","function":[],"url":"https:\/\/www.uniprot.org\/uniprot\/P22939","_search_query":"MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"} +{"_doc_id":"doc-Q8I8V0","type":"protein","content":"Function: ['Component of several Gcn5-containing histone acetyltransferase complexes that regulate nucleosome organization; involved in acetylation of histone H3, particularly on Lys-10 (H3K9ac) and Lys-15 (H3K14ac) (PubMed:12482983, PubMed:12697829, PubMed:15340070, PubMed:19740772, PubMed:22796493). Regulates the transcription of a subset of genes during development; affects recruitment of RNA polymerase II (PubMed:19740772, PubMed:23336284). May be involved in the function of some acidic activation domains, which activate transcription at distant sites (PubMed:12697829). Involved in the p53-dependent apoptosis pathway response to DNA damage by genotoxic agents (PubMed:15340070, PubMed:16135810). {ECO:0000269|PubMed:12482983, ECO:0000269|PubMed:12697829, ECO:0000269|PubMed:15340070, ECO:0000269|PubMed:16135810, ECO:0000269|PubMed:19740772, ECO:0000269|PubMed:22796493, ECO:0000269|PubMed:23336284}.', '[Isoform B]: Component of the SAGA histone acetyltransferase complex, which predominantly acetylates histone H3. {ECO:0000269|PubMed:30559249}.', '[Isoform A]: Component of the CHAT histone acetyltransferase complex, which predominantly acetylates histone H3. 
{ECO:0000269|PubMed:30559249}.']\nSequence: MTTIADLFTKYNCTNCQDDIQGIRVHCAECENFDLCLQCFAAGAEIGAHQNNHSYQFMDTGTSILSVFRGKGAWTAREEIRLLDAIEQYGFGNWEDISKHIETKSAEDAKEEYVNKFVNGTIGRATWTPAQSQRPRLIDHTGDDDAGPLGTNALSTLPPLEINSDEAMQLGYMPNRDSFEREYDPTAEQLISNISLSSEDTEVDVMLKLAHVDIYTRRLRERARRKRMVRDYQLVSNFFRNRNYAQQQGLTKEQREFRDRFRVYAQFYTCNEYERLLGSLEREKELRIRQSELYRYRYNGLTKIAECTHFEQHAATATHRSTGPYGHGKTDHTHTSNGSHRPPSSSLHSPQPNLRKVEMSSGGEASSNSIAPRNTLHIADPTCSGALLPSKNYLDSCRGSSAATMLQTTGMVMGVTVDSGATTGVTSTATTMANLPTNSAKGSQQHLQPLQQHPQLLQSGNQHKMQNEAAGGGSDQVPSMSLKLRTQLEELKHLPQPPGSELLSHNELDLCKKHNITPTTYLSVKTVCLSGAPSLGSPMETSLRKFFIKCGWLSH","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"Q8I8V0","entry_name":"TAD2B_DROME","gene_names":[{"Name":"Ada2b {ECO:0000312|FlyBase:FBgn0037555}","Synonyms":["Ada2S {ECO:0000303|PubMed:12697829}"],"ORFNames":["CG9638 {ECO:0000312|FlyBase:FBgn0037555}"]}],"protein_name":"Transcriptional adapter 2b {ECO:0000312|FlyBase:FBgn0037555}","organism":"Drosophila melanogaster","sequence":"MTTIADLFTKYNCTNCQDDIQGIRVHCAECENFDLCLQCFAAGAEIGAHQNNHSYQFMDTGTSILSVFRGKGAWTAREEIRLLDAIEQYGFGNWEDISKHIETKSAEDAKEEYVNKFVNGTIGRATWTPAQSQRPRLIDHTGDDDAGPLGTNALSTLPPLEINSDEAMQLGYMPNRDSFEREYDPTAEQLISNISLSSEDTEVDVMLKLAHVDIYTRRLRERARRKRMVRDYQLVSNFFRNRNYAQQQGLTKEQREFRDRFRVYAQFYTCNEYERLLGSLEREKELRIRQSELYRYRYNGLTKIAECTHFEQHAATATHRSTGPYGHGKTDHTHTSNGSHRPPSSSLHSPQPNLRKVEMSSGGEASSNSIAPRNTLHIADPTCSGALLPSKNYLDSCRGSSAATMLQTTGMVMGVTVDSGATTGVTSTATTMANLPTNSAKGSQQHLQPLQQHPQLLQSGNQHKMQNEAAGGGSDQVPSMSLKLRTQLEELKHLPQPPGSELLSHNELDLCKKHNITPTTYLSVKTVCLSGAPSLGSPMETSLRKFFIKCGWLSH","function":["Component of several Gcn5-containing histone acetyltransferase complexes that regulate nucleosome organization; involved in acetylation of histone H3, particularly on Lys-10 (H3K9ac) and Lys-15 (H3K14ac) (PubMed:12482983, PubMed:12697829, PubMed:15340070, PubMed:19740772, PubMed:22796493). Regulates the transcription of a subset of genes during development; affects recruitment of RNA polymerase II (PubMed:19740772, PubMed:23336284). May be involved in the function of some acidic activation domains, which activate transcription at distant sites (PubMed:12697829). Involved in the p53-dependent apoptosis pathway response to DNA damage by genotoxic agents (PubMed:15340070, PubMed:16135810). {ECO:0000269|PubMed:12482983, ECO:0000269|PubMed:12697829, ECO:0000269|PubMed:15340070, ECO:0000269|PubMed:16135810, ECO:0000269|PubMed:19740772, ECO:0000269|PubMed:22796493, ECO:0000269|PubMed:23336284}.","[Isoform B]: Component of the SAGA histone acetyltransferase complex, which predominantly acetylates histone H3. {ECO:0000269|PubMed:30559249}.","[Isoform A]: Component of the CHAT histone acetyltransferase complex, which predominantly acetylates histone H3. {ECO:0000269|PubMed:30559249}."],"url":"https:\/\/www.uniprot.org\/uniprot\/Q8I8V0","_search_query":"p53"} +{"_doc_id":"doc-P04637","type":"protein","content":"Function: ['Multifunctional transcription factor that induces cell cycle arrest, DNA repair or apoptosis upon binding to its target DNA sequence (PubMed:11025664, PubMed:12524540, PubMed:12810724, PubMed:15186775, PubMed:15340061, PubMed:17317671, PubMed:17349958, PubMed:19556538, PubMed:20673990, PubMed:20959462, PubMed:22726440, PubMed:24051492, PubMed:24652652, PubMed:35618207, PubMed:36634798, PubMed:38653238, PubMed:9840937). 
Acts as a tumor suppressor in many tumor types; induces growth arrest or apoptosis depending on the physiological circumstances and cell type (PubMed:11025664, PubMed:12524540, PubMed:12810724, PubMed:15186775, PubMed:15340061, PubMed:17189187, PubMed:17317671, PubMed:17349958, PubMed:19556538, PubMed:20673990, PubMed:20959462, PubMed:22726440, PubMed:24051492, PubMed:24652652, PubMed:38653238, PubMed:9840937). Negatively regulates cell division by controlling expression of a set of genes required for this process (PubMed:11025664, PubMed:12524540, PubMed:12810724, PubMed:15186775, PubMed:15340061, PubMed:17317671, PubMed:17349958, PubMed:19556538, PubMed:20673990, PubMed:20959462, PubMed:22726440, PubMed:24051492, PubMed:24652652, PubMed:9840937). One of the activated genes is an inhibitor of cyclin-dependent kinases. Apoptosis induction seems to be mediated either by stimulation of BAX and FAS antigen expression, or by repression of Bcl-2 expression (PubMed:12524540, PubMed:17189187). Its pro-apoptotic activity is activated via its interaction with PPP1R13B\/ASPP1 or TP53BP2\/ASPP2 (PubMed:12524540). However, this activity is inhibited when the interaction with PPP1R13B\/ASPP1 or TP53BP2\/ASPP2 is displaced by PPP1R13L\/iASPP (PubMed:12524540). In cooperation with mitochondrial PPIF is involved in activating oxidative stress-induced necrosis; the function is largely independent of transcription. Induces the transcription of long intergenic non-coding RNA p21 (lincRNA-p21) and lincRNA-Mkln1. LincRNA-p21 participates in TP53-dependent transcriptional repression leading to apoptosis and seems to have an effect on cell-cycle regulation. Implicated in Notch signaling cross-over. Prevents CDK7 kinase activity when associated to CAK complex in response to DNA damage, thus stopping cell cycle progression. Isoform 2 enhances the transactivation activity of isoform 1 from some but not all TP53-inducible promoters. Isoform 4 suppresses transactivation activity and impairs growth suppression mediated by isoform 1. Isoform 7 inhibits isoform 1-mediated apoptosis. Regulates the circadian clock by repressing CLOCK-BMAL1-mediated transcriptional activation of PER2 (PubMed:24051492). 
{ECO:0000269|PubMed:11025664, ECO:0000269|PubMed:12524540, ECO:0000269|PubMed:12810724, ECO:0000269|PubMed:15186775, ECO:0000269|PubMed:15340061, ECO:0000269|PubMed:17189187, ECO:0000269|PubMed:17317671, ECO:0000269|PubMed:17349958, ECO:0000269|PubMed:19556538, ECO:0000269|PubMed:20673990, ECO:0000269|PubMed:20959462, ECO:0000269|PubMed:22726440, ECO:0000269|PubMed:24051492, ECO:0000269|PubMed:24652652, ECO:0000269|PubMed:35618207, ECO:0000269|PubMed:36634798, ECO:0000269|PubMed:38653238, ECO:0000269|PubMed:9840937}.']\nSequence: MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD","data_source":"uniprot","molecule_type":"protein","database":"UniProt","id":"P04637","entry_name":"P53_HUMAN","gene_names":[{"Name":"TP53","Synonyms":["P53"]}],"protein_name":"Cellular tumor antigen p53","organism":"Homo sapiens","sequence":"MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD","function":["Multifunctional transcription factor that induces cell cycle arrest, DNA repair or apoptosis upon binding to its target DNA sequence (PubMed:11025664, PubMed:12524540, PubMed:12810724, PubMed:15186775, PubMed:15340061, PubMed:17317671, PubMed:17349958, PubMed:19556538, PubMed:20673990, PubMed:20959462, PubMed:22726440, PubMed:24051492, PubMed:24652652, PubMed:35618207, PubMed:36634798, PubMed:38653238, PubMed:9840937). Acts as a tumor suppressor in many tumor types; induces growth arrest or apoptosis depending on the physiological circumstances and cell type (PubMed:11025664, PubMed:12524540, PubMed:12810724, PubMed:15186775, PubMed:15340061, PubMed:17189187, PubMed:17317671, PubMed:17349958, PubMed:19556538, PubMed:20673990, PubMed:20959462, PubMed:22726440, PubMed:24051492, PubMed:24652652, PubMed:38653238, PubMed:9840937). Negatively regulates cell division by controlling expression of a set of genes required for this process (PubMed:11025664, PubMed:12524540, PubMed:12810724, PubMed:15186775, PubMed:15340061, PubMed:17317671, PubMed:17349958, PubMed:19556538, PubMed:20673990, PubMed:20959462, PubMed:22726440, PubMed:24051492, PubMed:24652652, PubMed:9840937). One of the activated genes is an inhibitor of cyclin-dependent kinases. Apoptosis induction seems to be mediated either by stimulation of BAX and FAS antigen expression, or by repression of Bcl-2 expression (PubMed:12524540, PubMed:17189187). Its pro-apoptotic activity is activated via its interaction with PPP1R13B\/ASPP1 or TP53BP2\/ASPP2 (PubMed:12524540). However, this activity is inhibited when the interaction with PPP1R13B\/ASPP1 or TP53BP2\/ASPP2 is displaced by PPP1R13L\/iASPP (PubMed:12524540). In cooperation with mitochondrial PPIF is involved in activating oxidative stress-induced necrosis; the function is largely independent of transcription. Induces the transcription of long intergenic non-coding RNA p21 (lincRNA-p21) and lincRNA-Mkln1. 
LincRNA-p21 participates in TP53-dependent transcriptional repression leading to apoptosis and seems to have an effect on cell-cycle regulation. Implicated in Notch signaling cross-over. Prevents CDK7 kinase activity when associated to CAK complex in response to DNA damage, thus stopping cell cycle progression. Isoform 2 enhances the transactivation activity of isoform 1 from some but not all TP53-inducible promoters. Isoform 4 suppresses transactivation activity and impairs growth suppression mediated by isoform 1. Isoform 7 inhibits isoform 1-mediated apoptosis. Regulates the circadian clock by repressing CLOCK-BMAL1-mediated transcriptional activation of PER2 (PubMed:24051492). {ECO:0000269|PubMed:11025664, ECO:0000269|PubMed:12524540, ECO:0000269|PubMed:12810724, ECO:0000269|PubMed:15186775, ECO:0000269|PubMed:15340061, ECO:0000269|PubMed:17189187, ECO:0000269|PubMed:17317671, ECO:0000269|PubMed:17349958, ECO:0000269|PubMed:19556538, ECO:0000269|PubMed:20673990, ECO:0000269|PubMed:20959462, ECO:0000269|PubMed:22726440, ECO:0000269|PubMed:24051492, ECO:0000269|PubMed:24652652, ECO:0000269|PubMed:35618207, ECO:0000269|PubMed:36634798, ECO:0000269|PubMed:38653238, ECO:0000269|PubMed:9840937}."],"url":"https:\/\/www.uniprot.org\/uniprot\/P04637","_search_query":"P04637"} diff --git a/examples/input_examples/searched_rna_demo.jsonl b/examples/input_examples/searched_rna_demo.jsonl new file mode 100644 index 00000000..9ad088c0 --- /dev/null +++ b/examples/input_examples/searched_rna_demo.jsonl @@ -0,0 +1,6 @@ +{"_doc_id":"doc-URS0000123456","type":"rna","content":"Description: rRNA from 1 species\nSequence: CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG","data_source":"rnacentral","molecule_type":"RNA","database":"RNAcentral","id":"URS0000123456","rnacentral_id":"URS0000123456","sequence":"CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG","sequence_length":282,"rna_type":"rRNA","description":"rRNA from 1 species","url":"https:\/\/rnacentral.org\/rna\/URS0000123456","organism":"uncultured Staphylococcus sp.","related_genes":null,"gene_name":null,"so_term":"ncRNA","modifications":null,"_search_query":"URS0000123456"} +{"_doc_id":"doc-URS00000088CC","type":"rna","content":"Description: lncRNA from 1 species\nSequence: GCAGUUCUCAGCCAUGACAGAUGGGAGUUUCGGCCCAAUUGACCAGUAUUCCUUACUGAUAAGAGACACUGACCAUGGAGUGGUUCUGGUGAGAUGACAUGACCCUCGUGAAGGGGCCUGAAGCUUCAUUGUGUUUGUGUAUGUUUCUCUCUUCAAAAAUAUUCAUGACUUCUCCUGUAGCUUGAUAAAUAUGUAUAUUUACACACUGCA","data_source":"rnacentral","molecule_type":"RNA","database":"RNAcentral","id":"URS00000088CC","rnacentral_id":"URS00000088CC","sequence":"GCAGUUCUCAGCCAUGACAGAUGGGAGUUUCGGCCCAAUUGACCAGUAUUCCUUACUGAUAAGAGACACUGACCAUGGAGUGGUUCUGGUGAGAUGACAUGACCCUCGUGAAGGGGCCUGAAGCUUCAUUGUGUUUGUGUAUGUUUCUCUCUUCAAAAAUAUUCAUGACUUCUCCUGUAGCUUGAUAAAUAUGUAUAUUUACACACUGCA","sequence_length":210,"rna_type":"lncRNA","description":"lncRNA from 1 species","url":"https:\/\/rnacentral.org\/rna\/URS00000088CC","organism":"Homo 
sapiens","related_genes":["ENSG00000265458.1","lnc-C17orf62-1","ENSG00000265458","NONHSAG023099","HSALNG0119438","NONHSAG023099.2","ENSG00000265458.4","RP13-20L14.6","NARF-AS2"],"gene_name":"ENSG00000265458, ENSG00000265458.1, ENSG00000265458.4, HSALNG0119438, NARF-AS2, NONHSAG023099, NONHSAG023099.2, RP13-20L14.6, lnc-C17orf62-1","so_term":"antisense, ncRNA","modifications":null,"_search_query":"GCAGTTCTCAGCCATGACAGATGGGAGTTTCGGCCCAATTGACCAGTATTCCTTACTGATAAGAGACACTGACCATGGAGTGGTTCTGGTGAGATGACATGACCCTCGTGAAGGGGCCTGAAGCTTCATTGTGTTTGTGTATGTTTCTCTCTTCAAAAATATTCATGACTTCTCCTGTAGCTTGATAAATATGTATATTTACACACTGCA"} +{"_doc_id":"doc-URS000342178E","type":"rna","content":"Description: None misc RNA\nSequence: GGUUUUCGUAUAUCCUUAAUGAUAUGGUUUAAGGGCAAUACAUAGAAACCACAAAUUUCUUACUGCGAAAAUC","data_source":"rnacentral","molecule_type":"RNA","database":"RNAcentral","id":"URS000342178E","rnacentral_id":"URS000342178E","sequence":"GGUUUUCGUAUAUCCUUAAUGAUAUGGUUUAAGGGCAAUACAUAGAAACCACAAAUUUCUUACUGCGAAAAUC","sequence_length":73,"rna_type":"misc_RNA","description":"None misc RNA","url":"https:\/\/rnacentral.org\/rna\/URS000342178E","organism":null,"related_genes":null,"gene_name":null,"so_term":"ncRNA","modifications":null,"_search_query":"XIST regulator"} +{"_doc_id":"doc-URS0000123456","type":"rna","content":"Description: rRNA from 1 species\nSequence: CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG","data_source":"rnacentral","molecule_type":"RNA","database":"RNAcentral","id":"URS0000123456","rnacentral_id":"URS0000123456","sequence":"CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG","sequence_length":282,"rna_type":"rRNA","description":"rRNA from 1 species","url":"https:\/\/rnacentral.org\/rna\/URS0000123456","organism":"uncultured Staphylococcus sp.","related_genes":null,"gene_name":null,"so_term":"ncRNA","modifications":null,"_search_query":"CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG"} +{"_doc_id":"doc-URS0000000787","type":"rna","content":"Description: lncRNA from 1 species\nSequence: 
AGGGAUCUUCUGCCCUUGGUCCUAAGUGCCACUAUCUGUGCUGAGUUUUUCAAAGGUCAGAGCAGAUUGAACCAUUGUGGUUUCAUUUUCCCUGAUUUUGAUUUUUCUUAUGGGGAACCUGUGUGGCUGCAUUCAAGGUGACUCGAAGAAGCCUUCCAAAAAGCAUGUGAAAAGGAAGCCCUACUCUACUACCAAGGUGACUUCAGGGAGCACAUUCAAUGGUACGUAUUCUGGAAUCACUCACUGGUUGUUAGAAAAGGAUUCUACAGGAAAUCUGGAGCUUAACUGCUGGCUUUUGUCUGGAGAGCCUCCAUGAUCCAAGACAUCUGGUGGGAAUGAGGAUGUAGGGUAUAGUAAAAGAAACUGGUUUUCCUGGUGACAUACUCUUUUUAUCUAUGUAUAGUUUCUGGGAACAUGUUCACAUUAGGUUGUGUGUGGGUAUGUGUGUAUUAGGGCGGGGGUGGGGUGAGGUGGUCUGUGUGCAAGUCUGCAUGAUUUGCUUGUGAAUGUGUGUCUAUGUGUGUUUCCCCUAGGAAAAAAAUGUUGUGUUUACCCAGCACAACUCUCAGUGCCAUU","data_source":"rnacentral","molecule_type":"RNA","database":"RNAcentral","id":"URS0000000787","rnacentral_id":"URS0000000787","sequence":"AGGGAUCUUCUGCCCUUGGUCCUAAGUGCCACUAUCUGUGCUGAGUUUUUCAAAGGUCAGAGCAGAUUGAACCAUUGUGGUUUCAUUUUCCCUGAUUUUGAUUUUUCUUAUGGGGAACCUGUGUGGCUGCAUUCAAGGUGACUCGAAGAAGCCUUCCAAAAAGCAUGUGAAAAGGAAGCCCUACUCUACUACCAAGGUGACUUCAGGGAGCACAUUCAAUGGUACGUAUUCUGGAAUCACUCACUGGUUGUUAGAAAAGGAUUCUACAGGAAAUCUGGAGCUUAACUGCUGGCUUUUGUCUGGAGAGCCUCCAUGAUCCAAGACAUCUGGUGGGAAUGAGGAUGUAGGGUAUAGUAAAAGAAACUGGUUUUCCUGGUGACAUACUCUUUUUAUCUAUGUAUAGUUUCUGGGAACAUGUUCACAUUAGGUUGUGUGUGGGUAUGUGUGUAUUAGGGCGGGGGUGGGGUGAGGUGGUCUGUGUGCAAGUCUGCAUGAUUUGCUUGUGAAUGUGUGUCUAUGUGUGUUUCCCCUAGGAAAAAAAUGUUGUGUUUACCCAGCACAACUCUCAGUGCCAUU","sequence_length":576,"rna_type":"lncRNA","description":"lncRNA from 1 species","url":"https:\/\/rnacentral.org\/rna\/URS0000000787","organism":"Homo sapiens","related_genes":["KB-1183D5.13","lnc-GGT2-26","ENSG00000206142.10","ENSG00000206142.9","NONHSAG033362.2","FAM230H","NONHSAG033362","lnc-GGT2-4","ENSG00000206142","lnc-GGT2-2","HSALNG0134219"],"gene_name":"ENSG00000206142, ENSG00000206142.10, ENSG00000206142.9, FAM230H, HSALNG0134219, KB-1183D5.13, NONHSAG033362, NONHSAG033362.2, lnc-GGT2-2, lnc-GGT2-26, lnc-GGT2-4","so_term":"lincRNA, ncRNA","modifications":null,"_search_query":"URS0000000787"} +{"_doc_id":"doc-URS0000000001","type":"rna","content":"Description: rRNA from 1 species\nSequence: AUUGAACGCUGGCGGCAGGCCUAACACAUGCAAGUCGAGCGGUAGAGAGAAGCUUGCUUCUCUUGAGAGCGGCGGACGGGUGAGUAAUGCCUAGGAAUCUGCCUGGUAGUGGGGGAUAACGCUCGGAAACGGACGCUAAUACCGCAUACGUCCUACGGGAGAAAGCAGGGGACCUUCGGGCCUUGCGCUAUCAGAUGAGC","data_source":"rnacentral","molecule_type":"RNA","database":"RNAcentral","id":"URS0000000001","rnacentral_id":"URS0000000001","sequence":"AUUGAACGCUGGCGGCAGGCCUAACACAUGCAAGUCGAGCGGUAGAGAGAAGCUUGCUUCUCUUGAGAGCGGCGGACGGGUGAGUAAUGCCUAGGAAUCUGCCUGGUAGUGGGGGAUAACGCUCGGAAACGGACGCUAAUACCGCAUACGUCCUACGGGAGAAAGCAGGGGACCUUCGGGCCUUGCGCUAUCAGAUGAGC","sequence_length":200,"rna_type":"rRNA","description":"rRNA from 1 species","url":"https:\/\/rnacentral.org\/rna\/URS0000000001","organism":"uncultured bacterium","related_genes":null,"gene_name":null,"so_term":"ncRNA","modifications":null,"_search_query":"URS0000000001"} diff --git a/examples/search/build_db/build_protein_blast_db.sh b/examples/search/build_db/build_protein_blast_db.sh deleted file mode 100755 index 9292875a..00000000 --- a/examples/search/build_db/build_protein_blast_db.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/bin/bash - -set -e - -# Downloads the latest release of UniProt, putting it in a release-specific directory. -# Creates associated BLAST databases. 
-# We need makeblastdb on our PATH -# For Ubuntu/Debian: sudo apt install ncbi-blast+ -# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+ -# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ - -# Better to use a stable DOWNLOAD_TMP name to support resuming downloads -DOWNLOAD_TMP=_downloading -mkdir -p ${DOWNLOAD_TMP} -cd ${DOWNLOAD_TMP} - -wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/RELEASE.metalink" - -# Extract the release name (like 2017_10 or 2017_1) -# Use sed for cross-platform compatibility (works on both macOS and Linux) -RELEASE=$(sed -n 's/.*\([0-9]\{4\}_[0-9]\{1,2\}\)<\/version>.*/\1/p' RELEASE.metalink | head -1) - -wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" -wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz" -wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/reldate.txt" -wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/README" -wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/LICENSE" - -cd .. - -mkdir ${RELEASE} -mv ${DOWNLOAD_TMP}/* ${RELEASE} -rmdir ${DOWNLOAD_TMP} - -cd ${RELEASE} - -gunzip uniprot_sprot.fasta.gz -gunzip uniprot_trembl.fasta.gz - -cat uniprot_sprot.fasta uniprot_trembl.fasta >uniprot_${RELEASE}.fasta - -makeblastdb -in uniprot_${RELEASE}.fasta -out uniprot_${RELEASE} -dbtype prot -parse_seqids -title uniprot_${RELEASE} -makeblastdb -in uniprot_sprot.fasta -out uniprot_sprot -dbtype prot -parse_seqids -title uniprot_sprot -makeblastdb -in uniprot_trembl.fasta -out uniprot_trembl -dbtype prot -parse_seqids -title uniprot_trembl - -cd .. - -echo "BLAST databases created successfully!" -echo "Database locations:" -echo " - Combined: $(pwd)/${RELEASE}/uniprot_${RELEASE}" -echo " - Swiss-Prot: $(pwd)/${RELEASE}/uniprot_sprot" -echo " - TrEMBL: $(pwd)/${RELEASE}/uniprot_trembl" -echo "" -echo "To use these databases, set in your config:" -echo " local_blast_db: $(pwd)/${RELEASE}/uniprot_sprot # or uniprot_${RELEASE} or uniprot_trembl" - diff --git a/examples/search/build_db/build_rna_blast_db.sh b/examples/search/build_db/build_rna_blast_db.sh deleted file mode 100755 index 26e1cd33..00000000 --- a/examples/search/build_db/build_rna_blast_db.sh +++ /dev/null @@ -1,219 +0,0 @@ -#!/bin/bash - -set -e - -# Downloads RNAcentral sequences and creates BLAST databases. -# This script downloads the RNAcentral active database, which is the same -# data source used for online RNAcentral searches, ensuring consistency -# between local and online search results. -# -# RNAcentral is a comprehensive database of non-coding RNA sequences that -# integrates data from multiple expert databases including RefSeq, Rfam, etc. -# -# Usage: ./build_rna_blast_db.sh [all|list|database_name] -# all (default): Download complete active database (~8.4G compressed) -# list: List all available database subsets -# database_name: Download specific database subset (e.g., refseq, rfam, mirbase) -# -# Available database subsets (examples): -# - refseq.fasta (~98M): RefSeq RNA sequences -# - rfam.fasta (~1.5G): Rfam RNA families -# - mirbase.fasta (~10M): microRNA sequences -# - ensembl.fasta (~2.9G): Ensembl annotations -# - See "list" option for complete list -# -# The complete "active" database contains all sequences from all expert databases. 
-# Using a specific database subset provides a smaller, focused database. -# -# We need makeblastdb on our PATH -# For Ubuntu/Debian: sudo apt install ncbi-blast+ -# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+ -# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ - -# RNAcentral HTTP base URL (using HTTPS for better reliability) -RNACENTRAL_BASE="https://ftp.ebi.ac.uk/pub/databases/RNAcentral" -RNACENTRAL_RELEASE_URL="${RNACENTRAL_BASE}/current_release" -RNACENTRAL_SEQUENCES_URL="${RNACENTRAL_RELEASE_URL}/sequences" -RNACENTRAL_BY_DB_URL="${RNACENTRAL_SEQUENCES_URL}/by-database" - -# Parse command line argument -DB_SELECTION=${1:-all} - -# List available databases if requested -if [ "${DB_SELECTION}" = "list" ]; then - echo "Available RNAcentral database subsets:" - echo "" - echo "Fetching list from RNAcentral FTP..." - listing=$(curl -s "${RNACENTRAL_BY_DB_URL}/") - echo "${listing}" | \ - grep -oE '' | \ - sed 's///' | \ - sort | \ - while read db; do - size=$(echo "${listing}" | grep -A 1 "${db}" | grep -oE '[0-9.]+[GMK]' | head -1 || echo "unknown") - echo " - ${db%.fasta}: ${size}" - done - echo "" - echo "Usage: $0 [database_name]" - echo " Example: $0 refseq # Download only RefSeq sequences (~98M)" - echo " Example: $0 rfam # Download only Rfam sequences (~1.5G)" - echo " Example: $0 all # Download complete active database (~8.4G)" - exit 0 -fi - -# Better to use a stable DOWNLOAD_TMP name to support resuming downloads -DOWNLOAD_TMP=_downloading_rnacentral -mkdir -p ${DOWNLOAD_TMP} -cd ${DOWNLOAD_TMP} - -# Get RNAcentral release version from release notes -echo "Getting RNAcentral release information..." -RELEASE_NOTES_URL="${RNACENTRAL_RELEASE_URL}/release_notes.txt" -RELEASE_NOTES="release_notes.txt" -wget -q "${RELEASE_NOTES_URL}" 2>/dev/null || { - echo "Warning: Could not download release notes, using current date as release identifier" - RELEASE=$(date +%Y%m%d) -} - -if [ -f "${RELEASE_NOTES}" ]; then - # Try to extract version from release notes (first line usually contains version info) - RELEASE=$(head -1 "${RELEASE_NOTES}" | grep -oE '[0-9]+\.[0-9]+' | head -1 | tr -d '.') -fi - -if [ -z "${RELEASE}" ]; then - RELEASE=$(date +%Y%m%d) - echo "Using date as release identifier: ${RELEASE}" -else - echo "RNAcentral release: ${RELEASE}" -fi - -# Download RNAcentral FASTA file -if [ "${DB_SELECTION}" = "all" ]; then - # Download complete active database - FASTA_FILE="rnacentral_active.fasta.gz" - DB_NAME="rnacentral" - echo "Downloading RNAcentral active sequences (~8.4G)..." - echo " Contains sequences currently present in at least one expert database" - echo " Uses standard URS IDs (e.g., URS000149A9AF)" - echo " ⭐ MATCHES the online RNAcentral API database - ensures consistency" - FASTA_URL="${RNACENTRAL_SEQUENCES_URL}/${FASTA_FILE}" - IS_COMPRESSED=true -else - # Download specific database subset - DB_NAME="${DB_SELECTION}" - FASTA_FILE="${DB_SELECTION}.fasta" - echo "Downloading RNAcentral database subset: ${DB_SELECTION}" - echo " This is a subset of the active database from a specific expert database" - echo " File: ${FASTA_FILE}" - FASTA_URL="${RNACENTRAL_BY_DB_URL}/${FASTA_FILE}" - IS_COMPRESSED=false - - # Check if database exists - if ! 
curl -s -o /dev/null -w "%{http_code}" "${FASTA_URL}" | grep -q "200"; then - echo "Error: Database '${DB_SELECTION}' not found" - echo "Run '$0 list' to see available databases" - exit 1 - fi -fi - -echo "Downloading from: ${FASTA_URL}" -echo "This may take a while depending on your internet connection..." -if [ "${DB_SELECTION}" = "all" ]; then - echo "File size is approximately 8-9GB, please be patient..." -else - echo "Downloading database subset..." -fi -wget -c --progress=bar:force "${FASTA_URL}" 2>&1 || { - echo "Error: Failed to download RNAcentral FASTA file" - echo "Please check your internet connection and try again" - echo "You can also try downloading manually from: ${FASTA_URL}" - exit 1 -} - -if [ ! -f "${FASTA_FILE}" ]; then - echo "Error: Downloaded file not found" - exit 1 -fi - -cd .. - -# Create release directory -if [ "${DB_SELECTION}" = "all" ]; then - OUTPUT_DIR="rnacentral_${RELEASE}" -else - OUTPUT_DIR="rnacentral_${DB_NAME}_${RELEASE}" -fi -mkdir -p ${OUTPUT_DIR} -mv ${DOWNLOAD_TMP}/* ${OUTPUT_DIR}/ 2>/dev/null || true -rmdir ${DOWNLOAD_TMP} 2>/dev/null || true - -cd ${OUTPUT_DIR} - -# Extract FASTA file if compressed -echo "Preparing RNAcentral sequences..." -if [ -f "${FASTA_FILE}" ]; then - if [ "${IS_COMPRESSED}" = "true" ]; then - echo "Decompressing ${FASTA_FILE}..." - OUTPUT_FASTA="${DB_NAME}_${RELEASE}.fasta" - gunzip -c "${FASTA_FILE}" > "${OUTPUT_FASTA}" || { - echo "Error: Failed to decompress FASTA file" - exit 1 - } - # Optionally remove the compressed file to save space - # rm "${FASTA_FILE}" - else - # File is not compressed, just copy/rename - OUTPUT_FASTA="${DB_NAME}_${RELEASE}.fasta" - cp "${FASTA_FILE}" "${OUTPUT_FASTA}" || { - echo "Error: Failed to copy FASTA file" - exit 1 - } - fi -else - echo "Error: FASTA file not found" - exit 1 -fi - -# Check if we have sequences -if [ ! -s "${OUTPUT_FASTA}" ]; then - echo "Error: FASTA file is empty" - exit 1 -fi - -# Get file size for user information -FILE_SIZE=$(du -h "${OUTPUT_FASTA}" | cut -f1) -echo "FASTA file size: ${FILE_SIZE}" - -echo "Creating BLAST database..." -# Create BLAST database for RNA sequences (use -dbtype nucl for nucleotide) -# Note: RNAcentral uses RNAcentral IDs (URS...) as sequence identifiers, -# which matches the format expected by the RNACentralSearch class -DB_OUTPUT_NAME="${DB_NAME}_${RELEASE}" -makeblastdb -in "${OUTPUT_FASTA}" \ - -out "${DB_OUTPUT_NAME}" \ - -dbtype nucl \ - -parse_seqids \ - -title "RNAcentral_${DB_NAME}_${RELEASE}" - -echo "" -echo "BLAST database created successfully!" -echo "Database location: $(pwd)/${DB_OUTPUT_NAME}" -echo "" -echo "To use this database, set in your config (search_rna_config.yaml):" -echo " rnacentral_params:" -echo " use_local_blast: true" -echo " local_blast_db: $(pwd)/${DB_OUTPUT_NAME}" -echo "" -echo "Note: The database files are:" -ls -lh ${DB_OUTPUT_NAME}.* | head -5 -echo "" -if [ "${DB_SELECTION}" = "all" ]; then - echo "This database uses RNAcentral IDs (URS...), which matches the online" - echo "RNAcentral search API, ensuring consistent results between local and online searches." -else - echo "This is a subset database from ${DB_SELECTION} expert database." - echo "For full coverage matching online API, use 'all' option." -fi - -cd .. 
- diff --git a/examples/search/search_dna.sh b/examples/search/search_dna.sh deleted file mode 100644 index d3c0d6ec..00000000 --- a/examples/search/search_dna.sh +++ /dev/null @@ -1,2 +0,0 @@ -python3 -m graphgen.run \ ---config_file graphgen/configs/search_dna_config.yaml diff --git a/examples/search/search_dna/README.md b/examples/search/search_dna/README.md new file mode 100644 index 00000000..f4e8be7a --- /dev/null +++ b/examples/search/search_dna/README.md @@ -0,0 +1,84 @@ +# Search DNA Sequences + +This example demonstrates how to search DNA sequences from NCBI RefSeq database using BLAST. + +## Overview + +The DNA search pipeline reads DNA sequence queries and searches against NCBI RefSeq database to find similar sequences and retrieve associated metadata. + +## Quick Start + +### 1. Build Local BLAST Database (Optional) + +If you want to use local BLAST for faster searches, first build the database: + +```bash +./build_db.sh [human_mouse_drosophila_yeast|representative|complete|all] +``` + +Options: +- `human_mouse_drosophila_yeast`: Download only Homo sapiens, Mus musculus, Drosophila melanogaster, and Saccharomyces cerevisiae sequences (minimal, smallest) +- `representative`: Download genomic sequences from major categories (recommended, smaller) +- `complete`: Download all complete genomic sequences from complete/ directory (very large) +- `all`: Download all genomic sequences from all categories (very large) + +The script will create a BLAST database in `refseq_${RELEASE}/` directory. + +### 2. Configure Search Parameters + +Edit `search_dna_config.yaml` to set: + +- **Input file path**: Set the path to your DNA sequence queries +- **NCBI parameters**: + - `email`: Your email address (required by NCBI) + - `tool`: Tool name for NCBI API + - `use_local_blast`: Set to `true` if you have a local BLAST database + - `local_blast_db`: Path to your local BLAST database (without .nhr extension) + +Example configuration: +```yaml +input_path: + - examples/input_examples/search_dna_demo.jsonl + +data_sources: [ncbi] +ncbi_params: + email: your_email@example.com # Required! + tool: GraphGen + use_local_blast: true + local_blast_db: refseq_release/refseq_release +``` + +### 3. Run the Search + +```bash +./search_dna.sh +``` + +Or run directly with Python: + +```bash +python3 -m graphgen.run \ + --config_file examples/search/search_dna/search_dna_config.yaml \ + --output_dir cache/ +``` + +## Input Format + +The input file should be in JSONL format with DNA sequence queries: + +```jsonl +{"type": "text", "content": "BRCA1"} +{"type": "text", "content": ">query\nATGCGATCG..."} +{"type": "text", "content": "ATGCGATCG..."} +``` + +## Output + +The search results will be saved in the output directory with matched sequences and metadata from NCBI RefSeq. 
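+
+For a quick sanity check of a finished run, the results file can be inspected with a few lines of Python. This is a minimal sketch, not part of the pipeline: the output path under `cache/` is a placeholder, and the schema should be read from your own output rather than assumed:
+
+```python
+import json
+from pathlib import Path
+
+# Placeholder path; point this at the JSONL file your run actually writes under cache/
+results_path = Path("cache/data/search_results.jsonl")
+
+with results_path.open(encoding="utf-8") as f:
+    records = [json.loads(line) for line in f if line.strip()]
+
+print(f"Loaded {len(records)} search results")
+if records:
+    # Print the available fields instead of assuming a fixed schema
+    print(sorted(records[0].keys()))
+```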
+
+## Notes
+
+- **NCBI requires an email address** - Make sure to set `email` in `ncbi_params`
+- **Local BLAST** provides faster searches and doesn't require internet connection during search
+- The local BLAST database can be very large (several GB to TB depending on the download type)
+- Adjust `max_concurrent` based on your system resources and API rate limits
diff --git a/examples/search/build_db/build_dna_blast_db.sh b/examples/search/search_dna/build_db.sh
similarity index 55%
rename from examples/search/build_db/build_dna_blast_db.sh
rename to examples/search/search_dna/build_db.sh
index 1928d7d0..969ebbac 100755
--- a/examples/search/build_db/build_dna_blast_db.sh
+++ b/examples/search/search_dna/build_db.sh
@@ -24,8 +24,8 @@ set -e
 # - {category}.{number}.genomic.fna.gz (genomic sequences)
 # - {category}.{number}.rna.fna.gz (RNA sequences)
 #
-# Usage: ./build_dna_blast_db.sh [human_mouse|representative|complete|all]
-#   human_mouse: Download only Homo sapiens and Mus musculus sequences (minimal, smallest)
+# Usage: ./build_dna_blast_db.sh [human_mouse_drosophila_yeast|representative|complete|all]
+#   human_mouse_drosophila_yeast: Download only Homo sapiens, Mus musculus, Drosophila melanogaster, and Saccharomyces cerevisiae sequences (minimal, smallest)
 #   representative: Download genomic sequences from major categories (recommended, smaller)
 #     Includes: vertebrate_mammalian, vertebrate_other, bacteria, archaea, fungi
 #   complete: Download all complete genomic sequences from complete/ directory (very large)
@@ -36,7 +36,7 @@ set -e
 # For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+
 # Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/
 
-DOWNLOAD_TYPE=${1:-human_mouse}
+DOWNLOAD_TYPE=${1:-human_mouse_drosophila_yeast}
 
 # Better to use a stable DOWNLOAD_TMP name to support resuming downloads
 DOWNLOAD_TMP=_downloading_dna
@@ -58,17 +58,35 @@ else
     echo "Using date as release identifier: ${RELEASE}"
 fi
 
-# Function to check if a file contains target species
-check_file_for_species() {
-    local url=$1
-    local filename=$2
-    local temp_file="/tmp/check_${filename//\//_}"
+    # First check if file is already downloaded locally
+    if check_file_downloaded "${filename}"; then
+        # File already exists, check if it contains target species
+        # Check both compressed and decompressed versions
+        local decompressed_file="${filename%.gz}"
+        if [ -f "${filename}" ]; then
+            # Compressed file exists
+            if gunzip -c "${filename}" 2>/dev/null | head -2000 | grep -qE "(Homo sapiens|Mus musculus|Drosophila melanogaster|Saccharomyces cerevisiae)"; then
+                return 0  # Contains target species
+            else
+                return 1  # Does not contain target species
+            fi
+        elif [ -f "${decompressed_file}" ]; then
+            # Decompressed file exists
+            if head -2000 "${decompressed_file}" 2>/dev/null | grep -qE "(Homo sapiens|Mus musculus|Drosophila melanogaster|Saccharomyces cerevisiae)"; then
+                return 0  # Contains target species
+            else
+                return 1  # Does not contain target species
+            fi
+        fi
+    fi
 
+    # File not downloaded yet, download first 500KB to check
     # Download first 500KB (enough to get many sequence headers)
     # This should be sufficient to identify the species in most cases
     if curl -s --max-time 30 --range 0-512000 "${url}" -o "${temp_file}" 2>/dev/null && [ -s "${temp_file}" ]; then
         # Try to decompress and check for species names
-        if gunzip -c "${temp_file}" 2>/dev/null | head -2000 | grep -qE "(Homo sapiens|Mus musculus)"; then
+        # Check for: Homo sapiens (human), Mus musculus (mouse), Drosophila melanogaster (fruit fly), Saccharomyces cerevisiae (yeast)
+        if gunzip -c "${temp_file}" 2>/dev/null | head -2000 | grep -qE "(Homo sapiens|Mus musculus|Drosophila melanogaster|Saccharomyces cerevisiae)"; then
             rm -f "${temp_file}"
             return 0  # Contains target species
         else
@@ -84,39 +102,57 @@ check_file_for_species() {
 
 # Download based on type
 case ${DOWNLOAD_TYPE} in
-    human_mouse)
-        echo "Downloading RefSeq sequences for Homo sapiens and Mus musculus only (minimal size)..."
-        echo "This will check each file to see if it contains human or mouse sequences..."
-        category="vertebrate_mammalian"
-        echo "Checking files in ${category} category..."
-
-        # Get list of files and save to temp file to avoid subshell issues
-        curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \
-            grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \
-            sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files.txt
+    human_mouse_drosophila_yeast)
+        echo "Downloading RefSeq sequences for Homo sapiens, Mus musculus, Drosophila melanogaster, and Saccharomyces cerevisiae (minimal size)..."
+        echo "This will check each file to see if it contains target species sequences..."
 
-        file_count=0
-        download_count=0
+        # Check multiple categories: vertebrate_mammalian (human, mouse), invertebrate (fruit fly), fungi (yeast)
+        categories="vertebrate_mammalian invertebrate fungi"
+        total_file_count=0
+        total_download_count=0
 
-        while read filename; do
-            file_count=$((file_count + 1))
-            url="https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}"
-            echo -n "[${file_count}] Checking ${filename}... "
+        for category in ${categories}; do
+            echo "Checking files in ${category} category..."
 
-            if check_file_for_species "${url}" "${filename}"; then
-                echo "✓ contains target species, downloading..."
-                download_count=$((download_count + 1))
-                wget -c -q --show-progress "${url}" || {
-                    echo "Warning: Failed to download ${filename}"
-                }
-            else
-                echo "✗ skipping (no human/mouse data)"
-            fi
-        done < /tmp/refseq_files.txt
+            # Get list of files and save to temp file to avoid subshell issues
+            curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \
+                grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \
+                sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files_${category}.txt
+
+            file_count=0
+            download_count=0
+
+            while read filename; do
+                file_count=$((file_count + 1))
+                total_file_count=$((total_file_count + 1))
+                url="https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}"
+                echo -n "[${total_file_count}] Checking ${category}/${filename}... "
+
+                if check_file_for_species "${url}" "${filename}"; then
+                    # Check if file is already downloaded
+                    if check_file_downloaded "${filename}"; then
+                        echo "✓ already downloaded (contains target species)"
+                        download_count=$((download_count + 1))
+                        total_download_count=$((total_download_count + 1))
+                    else
+                        echo "✓ contains target species, downloading..."
+                        download_count=$((download_count + 1))
+                        total_download_count=$((total_download_count + 1))
+                        wget -c -q --show-progress "${url}" || {
+                            echo "Warning: Failed to download ${filename}"
+                        }
+                    fi
+                else
+                    echo "✗ skipping (no target species data)"
+                fi
+            done < /tmp/refseq_files_${category}.txt
+
+            rm -f /tmp/refseq_files_${category}.txt
+            echo "  ${category}: Checked ${file_count} files, downloaded ${download_count} files."
+        done
 
-        rm -f /tmp/refseq_files.txt
         echo ""
-        echo "Summary: Checked ${file_count} files, downloaded ${download_count} files containing human or mouse sequences."
+ echo "Summary: Checked ${total_file_count} files total, downloaded ${total_download_count} files containing target species (human, mouse, fruit fly, yeast)." ;; representative) echo "Downloading RefSeq representative sequences (recommended, smaller size)..." @@ -124,52 +160,76 @@ case ${DOWNLOAD_TYPE} in # Note: You can modify this list based on your specific requirements for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi; do echo "Downloading ${category} sequences..." + # Get list of files and save to temp file to avoid subshell issues curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \ grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \ - sed 's/href="\(.*\)"/\1/' | \ - while read filename; do + sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files_${category}.txt + + while read filename; do + if check_file_downloaded "${filename}"; then + echo " ✓ ${filename} already downloaded, skipping..." + else echo " Downloading ${filename}..." wget -c -q --show-progress \ "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" || { echo "Warning: Failed to download ${filename}" } - done + fi + done < /tmp/refseq_files_${category}.txt + + rm -f /tmp/refseq_files_${category}.txt done ;; complete) echo "Downloading RefSeq complete genomic sequences (WARNING: very large, may take hours)..." + # Get list of files and save to temp file to avoid subshell issues curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/" | \ grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \ - sed 's/href="\(.*\)"/\1/' | \ - while read filename; do + sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files_complete.txt + + while read filename; do + if check_file_downloaded "${filename}"; then + echo " ✓ ${filename} already downloaded, skipping..." + else echo " Downloading ${filename}..." wget -c -q --show-progress \ "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/${filename}" || { echo "Warning: Failed to download ${filename}" } - done + fi + done < /tmp/refseq_files_complete.txt + + rm -f /tmp/refseq_files_complete.txt ;; all) echo "Downloading all RefSeq genomic sequences from all categories (WARNING: extremely large, may take many hours)..." # Download genomic sequences from all categories for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi invertebrate plant viral protozoa mitochondrion plastid plasmid other; do echo "Downloading ${category} genomic sequences..." + # Get list of files and save to temp file to avoid subshell issues curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \ grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \ - sed 's/href="\(.*\)"/\1/' | \ - while read filename; do + sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files_${category}.txt + + while read filename; do + if check_file_downloaded "${filename}"; then + echo " ✓ ${filename} already downloaded, skipping..." + else echo " Downloading ${filename}..." 
wget -c -q --show-progress \ "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" || { echo "Warning: Failed to download ${filename}" } - done + fi + done < /tmp/refseq_files_${category}.txt + + rm -f /tmp/refseq_files_${category}.txt done ;; *) echo "Error: Unknown download type '${DOWNLOAD_TYPE}'" - echo "Usage: $0 [human_mouse|representative|complete|all]" - echo " human_mouse: Download only Homo sapiens and Mus musculus (minimal)" + echo "Usage: $0 [human_mouse_drosophila_yeast|representative|complete|all]" + echo " human_mouse_drosophila_yeast: Download only Homo sapiens, Mus musculus, Drosophila melanogaster, and Saccharomyces cerevisiae (minimal)" echo " representative: Download major categories (recommended)" echo " complete: Download all complete genomic sequences (very large)" echo " all: Download all genomic sequences (extremely large)" diff --git a/examples/search/search_dna/search_dna.sh b/examples/search/search_dna/search_dna.sh new file mode 100644 index 00000000..ef51281d --- /dev/null +++ b/examples/search/search_dna/search_dna.sh @@ -0,0 +1,3 @@ +python3 -m graphgen.run \ +--config_file examples/search/search_dna/search_dna_config.yaml + diff --git a/examples/search/search_dna/search_dna_config.yaml b/examples/search/search_dna/search_dna_config.yaml new file mode 100644 index 00000000..adbe7e1c --- /dev/null +++ b/examples/search/search_dna/search_dna_config.yaml @@ -0,0 +1,29 @@ +global_params: + working_dir: cache + +nodes: + - id: read_step + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/search_dna_demo.jsonl # input file path, support json, jsonl, txt, pdf. See examples/input_examples for examples + + - id: search_step + op_name: search + type: map_batch + dependencies: + - read_step # search_step depends on read_step + execution_params: + replicas: 1 + batch_size: 10 + params: + data_sources: [ncbi] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral + ncbi_params: + email: test@example.com # NCBI requires an email address + tool: GraphGen # tool name for NCBI API + use_local_blast: true # whether to use local blast for DNA search + local_blast_db: refseq_release/refseq_release # path to local BLAST database (without .nhr extension) + + diff --git a/examples/search/search_dna_config.yaml b/examples/search/search_dna_config.yaml deleted file mode 100644 index f53a5eb8..00000000 --- a/examples/search/search_dna_config.yaml +++ /dev/null @@ -1,17 +0,0 @@ -pipeline: - - name: read_step - op_key: read - params: - input_file: resources/input_examples/search_dna_demo.jsonl # input file path, support json, jsonl, txt, pdf. 
See resources/input_examples for examples - - - name: search_step - op_key: search - deps: [read_step] # search_step depends on read_step - params: - data_sources: [ncbi] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral - ncbi_params: - email: test@example.com # NCBI requires an email address - tool: GraphGen # tool name for NCBI API - use_local_blast: true # whether to use local blast for DNA search - local_blast_db: refseq_release/refseq_release # path to local BLAST database (without .nhr extension) - diff --git a/examples/search/search_protein/README.md b/examples/search/search_protein/README.md new file mode 100644 index 00000000..650a8c69 --- /dev/null +++ b/examples/search/search_protein/README.md @@ -0,0 +1,80 @@ +# Search Protein Sequences + +This example demonstrates how to search protein sequences from UniProt database using BLAST. + +## Overview + +The protein search pipeline reads protein sequence queries and searches against UniProt database to find similar sequences and retrieve associated metadata. + +## Quick Start + +### 1. Build Local BLAST Database (Optional) + +If you want to use local BLAST for faster searches, first build the database: + +```bash +./build_db.sh +``` + +The script will download UniProt Swiss-Prot database and create a BLAST database. You can configure the download mode: +- `sprot` (default): Download only Swiss-Prot (high quality, curated) +- `full`: Download both Swiss-Prot and TrEMBL (complete database) + +The script will create a BLAST database in `${RELEASE}/` directory. + +### 2. Configure Search Parameters + +Edit `search_protein_config.yaml` to set: + +- **Input file path**: Set the path to your protein sequence queries +- **UniProt parameters**: + - `use_local_blast`: Set to `true` if you have a local BLAST database + - `local_blast_db`: Path to your local BLAST database (format: `/path/to/${RELEASE}/uniprot_sprot`) + +Example configuration: +```yaml +input_path: + - examples/input_examples/search_protein_demo.jsonl + +data_sources: [uniprot] +uniprot_params: + use_local_blast: true + local_blast_db: /your_path/2024_01/uniprot_sprot + # options: uniprot_sprot (recommended, high quality), uniprot_trembl, or uniprot_${RELEASE} (merged database) +``` + +### 3. Run the Search + +```bash +./search_uniprot.sh +``` + +Or run directly with Python: + +```bash +python3 -m graphgen.run \ + --config_file examples/search/search_protein/search_protein_config.yaml \ + --output_dir cache/ +``` + +## Input Format + +The input file should be in JSONL format with protein sequence queries: + +```jsonl +{"type": "text", "content": "P01308"} +{"type": "text", "content": "insulin"} +{"type": "text", "content": "MHHHHHHSSGVDLGTENLYFQSNAMDFPQQLEACVKQANQALSRFIAPLPFQNTPVVETMQYGALLGGKRLRPFLVYATGHMFGVSTNTLDAPAAAVECIHAYSLIHDDLPAMDDDDLRRGLPTCHVKFGEANAILAGDALQTLAFSILSDANMPEVSDRDRISMISELASASGIAGMCGGQALDLDAEGKHVPLDALERIHRHKTGALIRAAVRLGALSAGDKGRRALPVLDKYAESIGLAFQVQDDILDVVGDTATLGKRQGADQQLGKSTYPALLGLEQARKKARDLIDDARQALKQLAEQSLDTSALEALADYIIQRNK"} +``` + +## Output + +The search results will be saved in the output directory with matched sequences and metadata from UniProt. 
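+
+If you built a local database, it is worth confirming the BLAST index files exist before enabling `use_local_blast`. The sketch below assumes the placeholder path from the example config and a single-volume database; `makeblastdb -dbtype prot` writes `.phr`, `.pin`, and `.psq` files next to the database prefix:
+
+```python
+from pathlib import Path
+
+# Placeholder prefix from the example config; substitute your real ${RELEASE}/uniprot_sprot path
+db_prefix = "/your_path/2024_01/uniprot_sprot"
+
+# Core index files produced by makeblastdb for a single-volume protein database
+missing = [ext for ext in (".phr", ".pin", ".psq") if not Path(db_prefix + ext).exists()]
+
+if missing:
+    print("Missing BLAST index files:", ", ".join(missing))
+else:
+    print("Local protein BLAST database looks complete")
+```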
+
+## Notes
+
+- **Local BLAST** provides faster searches and doesn't require internet connection during search
+- **Swiss-Prot** is recommended for high-quality, curated protein sequences
+- **TrEMBL** contains automatically annotated sequences (larger database)
+- The merged database (`uniprot_${RELEASE}`) contains both Swiss-Prot and TrEMBL
+- Adjust `max_concurrent` based on your system resources and API rate limits
diff --git a/examples/search/search_protein/build_db.sh b/examples/search/search_protein/build_db.sh
new file mode 100755
index 00000000..da4c2b4b
--- /dev/null
+++ b/examples/search/search_protein/build_db.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+
+set -e
+
+# Downloads the latest release of UniProt, putting it in a release-specific directory.
+# Creates associated BLAST databases.
+# We need makeblastdb on our PATH
+# For Ubuntu/Debian: sudo apt install ncbi-blast+
+# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+
+# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/
+
+# Download mode: "sprot" (default, Swiss-Prot only) or "full" (Swiss-Prot + TrEMBL)
+DOWNLOAD_MODE=${1:-sprot}
+
+# UniProt download base URL
+UNIPROT_BASE="https://ftp.uniprot.org/pub/databases/uniprot"
+
+# Better to use a stable DOWNLOAD_TMP name to support resuming downloads
+DOWNLOAD_TMP=_downloading
+mkdir -p ${DOWNLOAD_TMP}
+cd ${DOWNLOAD_TMP}
+
+echo "Downloading RELEASE.metalink..."
+wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/RELEASE.metalink"
+
+# Extract the release name (like 2017_10 or 2017_1)
+# Use sed for cross-platform compatibility (works on both macOS and Linux)
+RELEASE=$(sed -n 's/.*<version>\([0-9]\{4\}_[0-9]\{1,2\}\)<\/version>.*/\1/p' RELEASE.metalink | head -1)
+
+echo "UniProt release: ${RELEASE}"
+echo ""
+
+# Download Swiss-Prot (always needed)
+echo "Downloading uniprot_sprot.fasta.gz..."
+wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz"
+
+# Download TrEMBL only if full mode
+if [ "${DOWNLOAD_MODE}" = "full" ]; then
+    echo "Downloading uniprot_trembl.fasta.gz..."
+    wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz"
+fi
+
+# Download metadata files
+echo "Downloading metadata files..."
+wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/reldate.txt"
+wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/README"
+wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/LICENSE"
+
+cd ..
+
+mkdir -p ${RELEASE}
+mv ${DOWNLOAD_TMP}/* ${RELEASE}
+rmdir ${DOWNLOAD_TMP}
+
+cd ${RELEASE}
+
+echo ""
+echo "Extracting files..."
+gunzip uniprot_sprot.fasta.gz
+
+if [ "${DOWNLOAD_MODE}" = "full" ]; then
+    gunzip uniprot_trembl.fasta.gz
+    echo "Merging Swiss-Prot and TrEMBL..."
+    cat uniprot_sprot.fasta uniprot_trembl.fasta >uniprot_${RELEASE}.fasta
+fi
+
+echo ""
+echo "Building BLAST databases..."
+
+# Always build Swiss-Prot database
+makeblastdb -in uniprot_sprot.fasta -out uniprot_sprot -dbtype prot -parse_seqids -title uniprot_sprot
+
+# Build full release database only if in full mode
+if [ "${DOWNLOAD_MODE}" = "full" ]; then
+    makeblastdb -in uniprot_${RELEASE}.fasta -out uniprot_${RELEASE} -dbtype prot -parse_seqids -title uniprot_${RELEASE}
+    makeblastdb -in uniprot_trembl.fasta -out uniprot_trembl -dbtype prot -parse_seqids -title uniprot_trembl
+fi
+
+cd ..
+
+echo ""
+echo "BLAST databases created successfully!"
+echo "Database locations:" +if [ "${DOWNLOAD_MODE}" = "sprot" ]; then + echo " - Swiss-Prot: $(pwd)/${RELEASE}/uniprot_sprot" + echo "" + echo "To use this database, set in your config:" + echo " local_blast_db: $(pwd)/${RELEASE}/uniprot_sprot" +else + echo " - Combined: $(pwd)/${RELEASE}/uniprot_${RELEASE}" + echo " - Swiss-Prot: $(pwd)/${RELEASE}/uniprot_sprot" + echo " - TrEMBL: $(pwd)/${RELEASE}/uniprot_trembl" + echo "" + echo "To use these databases, set in your config:" + echo " local_blast_db: $(pwd)/${RELEASE}/uniprot_sprot # or uniprot_${RELEASE} or uniprot_trembl" +fi + diff --git a/examples/search/search_protein/search_protein_config.yaml b/examples/search/search_protein/search_protein_config.yaml new file mode 100644 index 00000000..f73a4514 --- /dev/null +++ b/examples/search/search_protein/search_protein_config.yaml @@ -0,0 +1,27 @@ +global_params: + working_dir: cache + +nodes: + - id: read_step + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/search_protein_demo.jsonl # input file path, support json, jsonl, txt, pdf. See examples/input_examples for examples + + - id: search_step + op_name: search + type: map_batch + dependencies: + - read_step # search_step depends on read_step + execution_params: + replicas: 1 + batch_size: 10 + params: + data_sources: [uniprot] # data source for searcher, support: wikipedia, google, uniprot + uniprot_params: + use_local_blast: true # whether to use local blast for uniprot search + local_blast_db: /your_path/2024_01/uniprot_sprot # format: /path/to/${RELEASE}/uniprot_sprot + # options: uniprot_sprot (recommended, high quality), uniprot_trembl, or uniprot_${RELEASE} (merged database) + diff --git a/examples/search/search_protein/search_uniprot.sh b/examples/search/search_protein/search_uniprot.sh new file mode 100644 index 00000000..627735a0 --- /dev/null +++ b/examples/search/search_protein/search_uniprot.sh @@ -0,0 +1,2 @@ +python3 -m graphgen.run \ +--config_file examples/search/search_protein/search_protein_config.yaml diff --git a/examples/search/search_protein_config.yaml b/examples/search/search_protein_config.yaml deleted file mode 100644 index bfbf84eb..00000000 --- a/examples/search/search_protein_config.yaml +++ /dev/null @@ -1,15 +0,0 @@ -pipeline: - - name: read_step - op_key: read - params: - input_file: resources/input_examples/search_protein_demo.jsonl # input file path, support json, jsonl, txt, pdf. 
See resources/input_examples for examples - - - name: search_step - op_key: search - deps: [read_step] # search_step depends on read_step - params: - data_sources: [uniprot] # data source for searcher, support: wikipedia, google, uniprot - uniprot_params: - use_local_blast: true # whether to use local blast for uniprot search - local_blast_db: /your_path/2024_01/uniprot_sprot # format: /path/to/${RELEASE}/uniprot_sprot - # options: uniprot_sprot (recommended, high quality), uniprot_trembl, or uniprot_${RELEASE} (merged database) diff --git a/examples/search/search_rna.sh b/examples/search/search_rna.sh deleted file mode 100644 index 9243d6be..00000000 --- a/examples/search/search_rna.sh +++ /dev/null @@ -1,2 +0,0 @@ -python3 -m graphgen.run \ ---config_file graphgen/configs/search_rna_config.yaml diff --git a/examples/search/search_rna/README.md b/examples/search/search_rna/README.md new file mode 100644 index 00000000..2a59fed6 --- /dev/null +++ b/examples/search/search_rna/README.md @@ -0,0 +1,80 @@ +# Search RNA Sequences + +This example demonstrates how to search RNA sequences from RNAcentral database using BLAST. + +## Overview + +The RNA search pipeline reads RNA sequence queries and searches against RNAcentral database to find similar sequences and retrieve associated metadata. + +## Quick Start + +### 1. Build Local BLAST Database (Optional) + +If you want to use local BLAST for faster searches, first build the database: + +```bash +./build_db.sh [all|list|selected|database_name...] +``` + +Options: +- `all`: Download complete active database (~8.4G compressed) +- `list`: List all available database subsets +- `selected`: Download predefined database subsets (ensembl_gencode, mirbase, gtrnadb, refseq, lncbase, rfam) +- `database_name`: Download specific database subset (e.g., refseq, rfam, mirbase) + +The script will create a BLAST database in `rnacentral_${RELEASE}/` or `rnacentral_${DB_NAME}_${RELEASE}/` directory. + +### 2. Configure Search Parameters + +Edit `search_rna_config.yaml` to set: + +- **Input file path**: Set the path to your RNA sequence queries +- **RNAcentral parameters**: + - `use_local_blast`: Set to `true` if you have a local BLAST database + - `local_blast_db`: Path to your local BLAST database (without .nhr extension) + +Example configuration: +```yaml +input_path: + - examples/input_examples/search_rna_demo.jsonl + +data_sources: [rnacentral] +rnacentral_params: + use_local_blast: true + local_blast_db: rnacentral_ensembl_gencode_YYYYMMDD/ensembl_gencode_YYYYMMDD +``` + +### 3. Run the Search + +```bash +./search_rna.sh +``` + +Or run directly with Python: + +```bash +python3 -m graphgen.run \ + --config_file examples/search/search_rna/search_rna_config.yaml \ + --output_dir cache/ +``` + +## Input Format + +The input file should be in JSONL format with RNA sequence queries: + +```jsonl +{"type": "text", "content": "miR-21"} +{"type": "text", "content": ">query\nAUGCAUGC..."} +{"type": "text", "content": "AUGCAUGC..."} +``` + +## Output + +The search results will be saved in the output directory with matched sequences and metadata from RNAcentral. 
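+
+Each RNA record carries an RNAcentral URS identifier and basic annotations (for example `rnacentral_id`, `rna_type`, and `organism`, matching the demo records in this repository). As a rough post-processing sketch, the snippet below tallies hits by RNA type; the output path is a placeholder for wherever your run writes its JSONL results:
+
+```python
+import json
+from collections import Counter
+from pathlib import Path
+
+# Placeholder path; point this at the JSONL file your run actually writes under cache/
+results_path = Path("cache/data/search_results.jsonl")
+
+by_type = Counter()
+with results_path.open(encoding="utf-8") as f:
+    for line in f:
+        rec = json.loads(line)
+        by_type[rec.get("rna_type", "unknown")] += 1
+
+print(dict(by_type))
+```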
+ +## Notes + +- **Local BLAST** provides faster searches and doesn't require internet connection during search +- The complete RNAcentral database is large (~8.4G compressed), consider using specific database subsets for smaller downloads +- RNAcentral uses URS IDs (e.g., URS000149A9AF) which match the online RNAcentral API database +- Adjust `max_concurrent` based on your system resources and API rate limits diff --git a/examples/search/search_rna/build_db.sh b/examples/search/search_rna/build_db.sh new file mode 100755 index 00000000..af688ac1 --- /dev/null +++ b/examples/search/search_rna/build_db.sh @@ -0,0 +1,433 @@ +#!/bin/bash + +set -e + +# Downloads RNAcentral sequences and creates BLAST databases. +# This script downloads the RNAcentral active database, which is the same +# data source used for online RNAcentral searches, ensuring consistency +# between local and online search results. +# +# RNAcentral is a comprehensive database of non-coding RNA sequences that +# integrates data from multiple expert databases including RefSeq, Rfam, etc. +# +# Usage: ./build_rna_blast_db.sh [all|list|selected|database_name...] +# all (default): Download complete active database (~8.4G compressed) +# list: List all available database subsets +# selected: Download predefined database subsets (ensembl_gencode, mirbase, gtrnadb, refseq, lncbase) +# database_name: Download specific database subset (e.g., refseq, rfam, mirbase) +# database_name1 database_name2 ...: Download multiple database subsets +# +# Available database subsets (examples): +# - refseq.fasta (~98M): RefSeq RNA sequences +# - rfam.fasta (~1.5G): Rfam RNA families +# - mirbase.fasta (~10M): microRNA sequences +# - ensembl_gencode.fasta (~337M): Ensembl/GENCODE annotations (human) +# - gtrnadb.fasta (~38M): tRNA sequences +# - lncbase.fasta (~106K): Human lncRNA database +# - See "list" option for complete list +# +# The complete "active" database contains all sequences from all expert databases. +# Using a specific database subset provides a smaller, focused database. +# +# We need makeblastdb on our PATH +# For Ubuntu/Debian: sudo apt install ncbi-blast+ +# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+ +# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ + +# RNAcentral base URL (using EBI HTTPS) +# NOTE: RNAcentral only has one official mirror at EBI +RNACENTRAL_BASE="https://ftp.ebi.ac.uk/pub/databases/RNAcentral" +RNACENTRAL_RELEASE_URL="${RNACENTRAL_BASE}/current_release" +RNACENTRAL_SEQUENCES_URL="${RNACENTRAL_RELEASE_URL}/sequences" +RNACENTRAL_BY_DB_URL="${RNACENTRAL_SEQUENCES_URL}/by-database" + +# Parse command line arguments +DB_SELECTION=${1:-selected} + +# Predefined database list for "selected" option +SELECTED_DATABASES=("ensembl_gencode" "mirbase" "gtrnadb" "refseq" "lncbase" "rfam") + +# List available databases if requested +if [ "${DB_SELECTION}" = "list" ]; then + echo "Available RNAcentral database subsets:" + echo "" + echo "Fetching list from RNAcentral..." 
+    listing=$(curl -s "${RNACENTRAL_BY_DB_URL}/")
+    echo "${listing}" | \
+        grep -oE 'href="[^"]*\.fasta"' | \
+        sed 's/href="\(.*\)"/\1/' | \
+        sort | \
+        while read db; do
+            size=$(echo "${listing}" | grep -A 1 "${db}" | grep -oE '[0-9.]+[GMK]' | head -1 || echo "unknown")
+            echo "  - ${db%.fasta}: ${size}"
+        done
+    echo ""
+    echo "Usage: $0 [all|list|selected|database_name...]"
+    echo "  Example: $0 refseq          # Download only RefSeq sequences (~98M)"
+    echo "  Example: $0 rfam            # Download only Rfam sequences (~1.5G)"
+    echo "  Example: $0 selected        # Download predefined databases (ensembl_gencode, mirbase, gtrnadb, refseq, lncbase, rfam)"
+    echo "  Example: $0 refseq mirbase  # Download multiple databases"
+    echo "  Example: $0 all             # Download complete active database (~8.4G)"
+    exit 0
+fi
+
+# Determine which databases to download
+if [ "${DB_SELECTION}" = "selected" ]; then
+    # Use predefined database list
+    DATABASES=("${SELECTED_DATABASES[@]}")
+    echo "Downloading selected databases: ${DATABASES[*]}"
+elif [ "${DB_SELECTION}" = "all" ]; then
+    # Single database mode (all)
+    DATABASES=("all")
+else
+    # Multiple databases provided as arguments
+    DATABASES=("$@")
+fi
+
+# Get RNAcentral release version from release notes (once for all databases)
+echo "Getting RNAcentral release information..."
+RELEASE_NOTES_URL="${RNACENTRAL_RELEASE_URL}/release_notes.txt"
+RELEASE_NOTES_TMP=$(mktemp)
+wget -q "${RELEASE_NOTES_URL}" -O "${RELEASE_NOTES_TMP}" 2>/dev/null || {
+    echo "Warning: Could not download release notes, using current date as release identifier"
+    RELEASE=$(date +%Y%m%d)
+}
+
+if [ -f "${RELEASE_NOTES_TMP}" ] && [ -s "${RELEASE_NOTES_TMP}" ]; then
+    # Try to extract version from release notes (first line usually contains version info)
+    RELEASE=$(head -1 "${RELEASE_NOTES_TMP}" | grep -oE '[0-9]+\.[0-9]+' | head -1 | tr -d '.')
+    rm -f "${RELEASE_NOTES_TMP}"
+fi
+
+if [ -z "${RELEASE}" ]; then
+    RELEASE=$(date +%Y%m%d)
+    echo "Using date as release identifier: ${RELEASE}"
+else
+    echo "RNAcentral release: ${RELEASE}"
+fi
+
+# Process each database
+DB_COUNT=${#DATABASES[@]}
+DB_INDEX=0
+
+for DB_SELECTION in "${DATABASES[@]}"; do
+    DB_INDEX=$((DB_INDEX + 1))
+    echo ""
+    echo "=========================================="
+    echo "Processing database ${DB_INDEX}/${DB_COUNT}: ${DB_SELECTION}"
+    echo "=========================================="
+    echo ""
+
+    # Check if database already exists and is complete
+    # First check with current release version
+    if [ "${DB_SELECTION}" = "all" ]; then
+        OUTPUT_DIR="rnacentral_${RELEASE}"
+        DB_NAME="rnacentral"
+        DB_OUTPUT_NAME="${DB_NAME}_${RELEASE}"
+    else
+        OUTPUT_DIR="rnacentral_${DB_SELECTION}_${RELEASE}"
+        DB_NAME="${DB_SELECTION}"
+        DB_OUTPUT_NAME="${DB_NAME}_${RELEASE}"
+    fi
+
+    # Check if BLAST database already exists with current release
+    if [ -d "${OUTPUT_DIR}" ] && [ -f "${OUTPUT_DIR}/${DB_OUTPUT_NAME}.nhr" ] && [ -f "${OUTPUT_DIR}/${DB_OUTPUT_NAME}.nin" ]; then
+        echo "✓ Database ${DB_SELECTION} already exists and appears complete: ${OUTPUT_DIR}/"
+        echo "  BLAST database: ${OUTPUT_DIR}/${DB_OUTPUT_NAME}"
+        echo "  Skipping download and database creation..."
+ continue + fi + + # Also check for any existing version of this database (e.g., different release dates) + EXISTING_DIR=$(ls -d rnacentral_${DB_SELECTION}_* 2>/dev/null | head -1) + if [ -n "${EXISTING_DIR}" ] && [ "${DB_SELECTION}" != "all" ]; then + EXISTING_DB_NAME=$(basename "${EXISTING_DIR}" | sed "s/rnacentral_${DB_SELECTION}_//") + if [ -f "${EXISTING_DIR}/${DB_SELECTION}_${EXISTING_DB_NAME}.nhr" ] && [ -f "${EXISTING_DIR}/${DB_SELECTION}_${EXISTING_DB_NAME}.nin" ]; then + echo "✓ Database ${DB_SELECTION} already exists (version ${EXISTING_DB_NAME}): ${EXISTING_DIR}/" + echo " BLAST database: ${EXISTING_DIR}/${DB_SELECTION}_${EXISTING_DB_NAME}" + echo " Skipping download and database creation..." + echo " Note: Using existing version ${EXISTING_DB_NAME} instead of ${RELEASE}" + continue + fi + fi + + # Better to use a stable DOWNLOAD_TMP name to support resuming downloads + DOWNLOAD_TMP="_downloading_rnacentral_${DB_SELECTION}" + mkdir -p ${DOWNLOAD_TMP} + cd ${DOWNLOAD_TMP} + + # Download RNAcentral FASTA file + if [ "${DB_SELECTION}" = "all" ]; then + # Download complete active database + FASTA_FILE="rnacentral_active.fasta.gz" + DB_NAME="rnacentral" + echo "Downloading RNAcentral active sequences (~8.4G)..." + echo " Contains sequences currently present in at least one expert database" + echo " Uses standard URS IDs (e.g., URS000149A9AF)" + echo " ⭐ MATCHES the online RNAcentral API database - ensures consistency" + FASTA_URL="${RNACENTRAL_SEQUENCES_URL}/${FASTA_FILE}" + IS_COMPRESSED=true + else + # Download specific database subset + DB_NAME="${DB_SELECTION}" + FASTA_FILE="${DB_SELECTION}.fasta" + echo "Downloading RNAcentral database subset: ${DB_SELECTION}" + echo " This is a subset of the active database from a specific expert database" + echo " File: ${FASTA_FILE}" + FASTA_URL="${RNACENTRAL_BY_DB_URL}/${FASTA_FILE}" + IS_COMPRESSED=false + + # Check if database exists (use HTTP status code check for HTTPS) + HTTP_CODE=$(curl -s --max-time 10 -o /dev/null -w "%{http_code}" "${FASTA_URL}" 2>/dev/null | tail -1 || echo "000") + if ! echo "${HTTP_CODE}" | grep -q "^200$"; then + echo "Error: Database '${DB_SELECTION}' not found (HTTP code: ${HTTP_CODE})" + echo "Run '$0 list' to see available databases" + cd .. + rm -rf ${DOWNLOAD_TMP} + exit 1 + fi + fi + + echo "Downloading from: ${FASTA_URL}" + echo "This may take a while depending on your internet connection..." + if [ "${DB_SELECTION}" = "all" ]; then + echo "File size is approximately 8-9GB, please be patient..." + else + echo "Downloading database subset..." + fi + + wget -c "${FASTA_URL}" || { + echo "Error: Failed to download RNAcentral FASTA file" + echo "Please check your internet connection and try again" + echo "URL: ${FASTA_URL}" + cd .. + rm -rf ${DOWNLOAD_TMP} + exit 1 + } + + if [ ! -f "${FASTA_FILE}" ]; then + echo "Error: Downloaded file not found" + cd .. + rm -rf ${DOWNLOAD_TMP} + exit 1 + fi + + cd .. + + # Create release directory + if [ "${DB_SELECTION}" = "all" ]; then + OUTPUT_DIR="rnacentral_${RELEASE}" + else + OUTPUT_DIR="rnacentral_${DB_NAME}_${RELEASE}" + fi + mkdir -p ${OUTPUT_DIR} + mv ${DOWNLOAD_TMP}/* ${OUTPUT_DIR}/ 2>/dev/null || true + rmdir ${DOWNLOAD_TMP} 2>/dev/null || true + + cd ${OUTPUT_DIR} + + # Extract FASTA file if compressed + echo "Preparing RNAcentral sequences..." + if [ -f "${FASTA_FILE}" ]; then + if [ "${IS_COMPRESSED}" = "true" ]; then + echo "Decompressing ${FASTA_FILE}..." 
+ OUTPUT_FASTA="${DB_NAME}_${RELEASE}.fasta" + gunzip -c "${FASTA_FILE}" > "${OUTPUT_FASTA}" || { + echo "Error: Failed to decompress FASTA file" + cd .. + exit 1 + } + # Optionally remove the compressed file to save space + # rm "${FASTA_FILE}" + else + # File is not compressed, just copy/rename + OUTPUT_FASTA="${DB_NAME}_${RELEASE}.fasta" + cp "${FASTA_FILE}" "${OUTPUT_FASTA}" || { + echo "Error: Failed to copy FASTA file" + cd .. + exit 1 + } + fi + else + echo "Error: FASTA file not found" + cd .. + exit 1 + fi + + # Check if we have sequences + if [ ! -s "${OUTPUT_FASTA}" ]; then + echo "Error: FASTA file is empty" + cd .. + exit 1 + fi + + # Get file size for user information + FILE_SIZE=$(du -h "${OUTPUT_FASTA}" | cut -f1) + echo "FASTA file size: ${FILE_SIZE}" + + echo "Creating BLAST database..." + # Create BLAST database for RNA sequences (use -dbtype nucl for nucleotide) + # Note: RNAcentral uses RNAcentral IDs (URS...) as sequence identifiers, + # which matches the format expected by the RNACentralSearch class + DB_OUTPUT_NAME="${DB_NAME}_${RELEASE}" + makeblastdb -in "${OUTPUT_FASTA}" \ + -out "${DB_OUTPUT_NAME}" \ + -dbtype nucl \ + -parse_seqids \ + -title "RNAcentral_${DB_NAME}_${RELEASE}" + + echo "" + echo "BLAST database created successfully!" + echo "Database location: $(pwd)/${DB_OUTPUT_NAME}" + echo "" + echo "To use this database, set in your config (search_rna_config.yaml):" + echo " rnacentral_params:" + echo " use_local_blast: true" + echo " local_blast_db: $(pwd)/${DB_OUTPUT_NAME}" + echo "" + echo "Note: The database files are:" + ls -lh ${DB_OUTPUT_NAME}.* | head -5 + echo "" + if [ "${DB_SELECTION}" = "all" ]; then + echo "This database uses RNAcentral IDs (URS...), which matches the online" + echo "RNAcentral search API, ensuring consistent results between local and online searches." + else + echo "This is a subset database from ${DB_SELECTION} expert database." + echo "For full coverage matching online API, use 'all' option." + fi + + cd .. +done + +echo "" +echo "==========================================" +echo "All databases processed successfully!" +echo "==========================================" +echo "" + +# If multiple databases were downloaded, offer to merge them +if [ ${#DATABASES[@]} -gt 1 ] && [ "${DATABASES[0]}" != "all" ]; then + echo "Multiple databases downloaded. Creating merged database for unified search..." + MERGED_DIR="rnacentral_merged_${RELEASE}" + mkdir -p ${MERGED_DIR} + cd ${MERGED_DIR} + + MERGED_FASTA="rnacentral_merged_${RELEASE}.fasta" + MERGED_FASTA_TMP="${MERGED_FASTA}.tmp" + echo "Combining FASTA files from all databases..." + echo " Note: Duplicate sequence IDs will be removed (keeping first occurrence)..." + + # Combine all FASTA files into a temporary file + # Find actual database directories (may have different release versions) + FOUND_ANY=false + for DB_SELECTION in "${DATABASES[@]}"; do + [ "${DB_SELECTION}" = "all" ] && continue + + # Try current release version first, then search for any existing version + OUTPUT_FASTA="../rnacentral_${DB_SELECTION}_${RELEASE}/${DB_SELECTION}_${RELEASE}.fasta" + [ ! -f "${OUTPUT_FASTA}" ] && { + EXISTING_DIR=$(ls -d ../rnacentral_${DB_SELECTION}_* 2>/dev/null | head -1) + [ -n "${EXISTING_DIR}" ] && { + EXISTING_VERSION=$(basename "${EXISTING_DIR}" | sed "s/rnacentral_${DB_SELECTION}_//") + OUTPUT_FASTA="${EXISTING_DIR}/${DB_SELECTION}_${EXISTING_VERSION}.fasta" + } + } + + if [ -f "${OUTPUT_FASTA}" ]; then + echo " Adding ${DB_SELECTION} sequences..." 
+ cat "${OUTPUT_FASTA}" >> "${MERGED_FASTA_TMP}" + FOUND_ANY=true + else + echo " Warning: Could not find FASTA file for ${DB_SELECTION}" + fi + done + + # Validate that we have files to merge + if [ "${FOUND_ANY}" = "false" ] || [ ! -s "${MERGED_FASTA_TMP}" ]; then + echo "Error: No FASTA files found to merge" + cd .. + rm -rf ${MERGED_DIR} + exit 1 + fi + + # Remove duplicates based on sequence ID (keeping first occurrence) + echo " Removing duplicate sequence IDs..." + awk ' + /^>/ { + # Process previous sequence if we have one + if (current_id != "" && !seen[current_id]) { + print current_header ORS current_seq + seen[current_id] = 1 + } + # Start new sequence + current_header = $0 + current_id = substr($0, 2) + sub(/[ \t].*/, "", current_id) # Extract ID up to first space/tab + current_seq = "" + next + } + { + # Accumulate sequence data by concatenating lines + current_seq = current_seq $0 + } + END { + # Process last sequence + if (current_id != "" && !seen[current_id]) { + print current_header ORS current_seq + } + } + ' "${MERGED_FASTA_TMP}" > "${MERGED_FASTA}" + rm -f "${MERGED_FASTA_TMP}" + + # Check if merged file was created and has content + if [ ! -s "${MERGED_FASTA}" ]; then + echo "Warning: Merged FASTA file is empty or not created" + cd .. + rm -rf ${MERGED_DIR} + else + FILE_SIZE=$(du -h "${MERGED_FASTA}" | cut -f1) + echo "Merged FASTA file size: ${FILE_SIZE}" + + echo "Creating merged BLAST database..." + MERGED_DB_NAME="rnacentral_merged_${RELEASE}" + makeblastdb -in "${MERGED_FASTA}" \ + -out "${MERGED_DB_NAME}" \ + -dbtype nucl \ + -parse_seqids \ + -title "RNAcentral_Merged_${RELEASE}" + + echo "" + echo "✓ Merged BLAST database created successfully!" + echo "Database location: $(pwd)/${MERGED_DB_NAME}" + echo "" + echo "To use the merged database, set in your config (search_rna_config.yaml):" + echo " rnacentral_params:" + echo " use_local_blast: true" + echo " local_blast_db: $(pwd)/${MERGED_DB_NAME}" + echo "" + echo "Note: The merged database includes: ${DATABASES[*]}" + cd .. + fi +fi + +echo "" +echo "Summary of downloaded databases:" +for DB_SELECTION in "${DATABASES[@]}"; do + if [ "${DB_SELECTION}" = "all" ]; then + OUTPUT_DIR="rnacentral_${RELEASE}" + DB_NAME="rnacentral" + else + OUTPUT_DIR="rnacentral_${DB_SELECTION}_${RELEASE}" + DB_NAME="${DB_SELECTION}" + fi + if [ -d "${OUTPUT_DIR}" ]; then + echo " - ${DB_NAME}: ${OUTPUT_DIR}/" + fi +done + +if [ -d "rnacentral_merged_${RELEASE}" ]; then + echo " - merged (all databases): rnacentral_merged_${RELEASE}/" + echo "" + echo "💡 Recommendation: Use the merged database for searching across all databases." +fi + diff --git a/examples/search/search_rna/search_rna.sh b/examples/search/search_rna/search_rna.sh new file mode 100644 index 00000000..04206c17 --- /dev/null +++ b/examples/search/search_rna/search_rna.sh @@ -0,0 +1,3 @@ +python3 -m graphgen.run \ +--config_file examples/search/search_rna/search_rna_config.yaml + diff --git a/examples/search/search_rna/search_rna_config.yaml b/examples/search/search_rna/search_rna_config.yaml new file mode 100644 index 00000000..5b0e825e --- /dev/null +++ b/examples/search/search_rna/search_rna_config.yaml @@ -0,0 +1,26 @@ +global_params: + working_dir: cache + +nodes: + - id: read_step + op_name: read + type: source + dependencies: [] + params: + input_path: + - examples/input_examples/search_rna_demo.jsonl # input file path, support json, jsonl, txt, pdf. 
See examples/input_examples for examples + + - id: search_step + op_name: search + type: map_batch + dependencies: + - read_step # search_step depends on read_step + execution_params: + replicas: 1 + batch_size: 10 + params: + data_sources: [rnacentral] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral + rnacentral_params: + use_local_blast: true # whether to use local blast for RNA search + local_blast_db: rnacentral_ensembl_gencode_YYYYMMDD/ensembl_gencode_YYYYMMDD # path to local BLAST database (without .nhr extension) + diff --git a/examples/search/search_rna_config.yaml b/examples/search/search_rna_config.yaml deleted file mode 100644 index 10422988..00000000 --- a/examples/search/search_rna_config.yaml +++ /dev/null @@ -1,14 +0,0 @@ -pipeline: - - name: read_step - op_key: read - params: - input_file: resources/input_examples/search_rna_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples - - - name: search_step - op_key: search - deps: [read_step] # search_step depends on read_step - params: - data_sources: [rnacentral] # data source for searcher, support: wikipedia, google, uniprot, ncbi, rnacentral - rnacentral_params: - use_local_blast: true # whether to use local blast for RNA search - local_blast_db: rnacentral_ensembl_gencode_YYYYMMDD/ensembl_gencode_YYYYMMDD # path to local BLAST database (without .nhr extension) diff --git a/examples/search/search_uniprot.sh b/examples/search/search_uniprot.sh deleted file mode 100644 index 8cb666c0..00000000 --- a/examples/search/search_uniprot.sh +++ /dev/null @@ -1,2 +0,0 @@ -python3 -m graphgen.run \ ---config_file graphgen/configs/search_protein_config.yaml diff --git a/graphgen/bases/base_operator.py b/graphgen/bases/base_operator.py index 300d3178..8d0cddb5 100644 --- a/graphgen/bases/base_operator.py +++ b/graphgen/bases/base_operator.py @@ -1,4 +1,5 @@ import inspect +import logging import os from abc import ABC, abstractmethod from typing import Iterable, Union @@ -6,7 +7,7 @@ import pandas as pd import ray -from graphgen.utils import CURRENT_LOGGER_VAR, set_logger +from graphgen.utils.log import CURRENT_LOGGER_VAR, set_logger class BaseOperator(ABC): @@ -29,7 +30,8 @@ def __init__(self, working_dir: str = "cache", op_name: str = None): log_file = os.path.join(log_dir, f"{self.op_name}_{worker_id_short}.log") self.logger = set_logger( - log_file=log_file, name=f"{self.op_name}.{worker_id_short}", force=True + log_file=log_file, name=f"{self.op_name}.{worker_id_short}", + console_level=logging.ERROR, force=True ) self.logger.info( diff --git a/graphgen/bases/base_reader.py b/graphgen/bases/base_reader.py index 5d2af735..ba72f410 100644 --- a/graphgen/bases/base_reader.py +++ b/graphgen/bases/base_reader.py @@ -39,6 +39,8 @@ def _should_keep_item(self, item: Dict[str, Any]) -> bool: "table", "equation", "protein", + "dna", + "rna", ], f"Unsupported item type: {item_type}" if item_type == "text": content = item.get(self.text_column, "").strip() diff --git a/graphgen/bases/base_searcher.py b/graphgen/bases/base_searcher.py index f680ab04..cba0315e 100644 --- a/graphgen/bases/base_searcher.py +++ b/graphgen/bases/base_searcher.py @@ -1,12 +1,37 @@ +import logging +import os from abc import ABC, abstractmethod from typing import Any, Dict, List +from graphgen.utils.log import set_logger + class BaseSearcher(ABC): """ Abstract base class for searching and retrieving data. 
""" + def __init__(self, working_dir: str = "cache"): + """ + Initialize the base searcher with a logger. + + :param working_dir: Working directory for log files. + """ + log_dir = os.path.join(working_dir, "logs") + searcher_name = self.__class__.__name__ + + # e.g. cache/logs/NCBISearch.log + log_file = os.path.join(log_dir, f"{searcher_name}.log") + + self.logger = set_logger( + log_file=log_file, name=searcher_name, + console_level=logging.ERROR, force=True + ) + + self.logger.info( + "[%s] Searcher initialized", searcher_name + ) + @abstractmethod async def search(self, query: str, **kwargs) -> List[Dict[str, Any]]: """ @@ -16,3 +41,7 @@ async def search(self, query: str, **kwargs) -> List[Dict[str, Any]]: :param kwargs: Additional keyword arguments for the searcher. :return: List of dictionaries containing the searcher results. """ + + def get_logger(self): + """Get the logger instance.""" + return self.logger diff --git a/graphgen/engine.py b/graphgen/engine.py index 47ed242a..e6a7da4f 100644 --- a/graphgen/engine.py +++ b/graphgen/engine.py @@ -1,21 +1,21 @@ -import os import inspect -import logging +import os from collections import defaultdict, deque from functools import wraps from typing import Any, Callable, Dict, List, Set -from dotenv import load_dotenv import ray import ray.data +from dotenv import load_dotenv from ray.data import DataContext from graphgen.bases import Config, Node -from graphgen.utils import logger from graphgen.common import init_llm, init_storage +from graphgen.utils import logger load_dotenv() + class Engine: def __init__( self, config: Dict[str, Any], functions: Dict[str, Callable], **ray_init_kwargs @@ -42,13 +42,12 @@ def __init__( existing_env_vars = ray_init_kwargs["runtime_env"].get("env_vars", {}) ray_init_kwargs["runtime_env"]["env_vars"] = { **all_env_vars, - **existing_env_vars + **existing_env_vars, } if not ray.is_initialized(): context = ray.init( ignore_reinit_error=True, - logging_level=logging.ERROR, log_to_driver=True, **ray_init_kwargs, ) diff --git a/graphgen/models/__init__.py b/graphgen/models/__init__.py index 21344d74..4606715b 100644 --- a/graphgen/models/__init__.py +++ b/graphgen/models/__init__.py @@ -4,10 +4,11 @@ AtomicGenerator, CoTGenerator, MultiHopGenerator, + OmicsQAGenerator, QuizGenerator, VQAGenerator, ) -from .kg_builder import LightRAGKGBuilder, MMKGBuilder +from .kg_builder import LightRAGKGBuilder, MMKGBuilder, OmicsKGBuilder from .llm import HTTPClient, OllamaClient, OpenAIClient from .partitioner import ( AnchorBFSPartitioner, diff --git a/graphgen/models/evaluator/length_evaluator.py b/graphgen/models/evaluator/length_evaluator.py index d5c33211..9fbc6f3c 100644 --- a/graphgen/models/evaluator/length_evaluator.py +++ b/graphgen/models/evaluator/length_evaluator.py @@ -1,7 +1,8 @@ +import asyncio + from graphgen.bases.datatypes import QAPair from graphgen.models.evaluator.base_evaluator import BaseEvaluator from graphgen.models.tokenizer import Tokenizer -from graphgen.utils import create_event_loop class LengthEvaluator(BaseEvaluator): @@ -11,7 +12,8 @@ def __init__(self, tokenizer_name: str = "cl100k_base", max_concurrent: int = 10 self.tokenizer = Tokenizer(model_name=self.tokenizer_name) async def evaluate_single(self, pair: QAPair) -> float: - loop = create_event_loop() + # In async context, we should use the running loop + loop = asyncio.get_running_loop() return await loop.run_in_executor(None, self._calculate_length, pair.answer) def _calculate_length(self, text: str) -> float: diff --git 
a/graphgen/models/evaluator/mtld_evaluator.py b/graphgen/models/evaluator/mtld_evaluator.py index c106d86c..3423425b 100644 --- a/graphgen/models/evaluator/mtld_evaluator.py +++ b/graphgen/models/evaluator/mtld_evaluator.py @@ -2,7 +2,7 @@ from graphgen.bases.datatypes import QAPair from graphgen.models.evaluator.base_evaluator import BaseEvaluator -from graphgen.utils import NLTKHelper, create_event_loop, detect_main_language +from graphgen.utils import NLTKHelper, detect_main_language nltk_helper = NLTKHelper() @@ -18,7 +18,9 @@ def __init__(self, max_concurrent: int = 100): self.stopwords_zh: Set[str] = set(nltk_helper.get_stopwords("chinese")) async def evaluate_single(self, pair: QAPair) -> float: - loop = create_event_loop() + # In async context, we should use the running loop + import asyncio + loop = asyncio.get_running_loop() return await loop.run_in_executor(None, self._calculate_mtld_score, pair.answer) def _calculate_mtld_score(self, text: str, threshold=0.72) -> float: diff --git a/graphgen/models/generator/__init__.py b/graphgen/models/generator/__init__.py index 49f8979c..ec41f5dc 100644 --- a/graphgen/models/generator/__init__.py +++ b/graphgen/models/generator/__init__.py @@ -2,5 +2,6 @@ from .atomic_generator import AtomicGenerator from .cot_generator import CoTGenerator from .multi_hop_generator import MultiHopGenerator +from .omics_qa_generator import OmicsQAGenerator from .quiz_generator import QuizGenerator from .vqa_generator import VQAGenerator diff --git a/graphgen/models/generator/omics_qa_generator.py b/graphgen/models/generator/omics_qa_generator.py new file mode 100644 index 00000000..1c8e1112 --- /dev/null +++ b/graphgen/models/generator/omics_qa_generator.py @@ -0,0 +1,364 @@ +import re +from typing import Any, Optional + +from graphgen.bases import BaseGenerator +from graphgen.templates import OMICS_QA_GENERATION_PROMPT +from graphgen.utils import compute_content_hash, detect_main_language, logger + + +class OmicsQAGenerator(BaseGenerator): + """ + Unified QA generator for multi-omics data (DNA, RNA, Protein). + Automatically extracts and attaches molecule-specific caption information to QA pairs. + """ + + @staticmethod + def build_prompt( + batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]] + ) -> str: + nodes, edges = batch + entities_str = "\n".join( + [ + f"{index + 1}. {node[0]}: {node[1]['description']}" + for index, node in enumerate(nodes) + ] + ) + + relationships_str = "\n".join( + [ + f"{index + 1}. 
{edge[0]} -- {edge[1]}: {edge[2]['description']}" + for index, edge in enumerate(edges) + ] + ) + language = detect_main_language(entities_str + relationships_str) + prompt = OMICS_QA_GENERATION_PROMPT[language].format( + entities=entities_str, relationships=relationships_str + ) + return prompt + + @staticmethod + def parse_response(response: str) -> Any: + """ + Parse the LLM response and return the generated QAs + :param response + :return: QA pairs + """ + qa_pairs = {} + qa_list = response.strip().split("\n\n") + for qa in qa_list: + match = re.search( + r"Question:\s*(.*?)\s*Answer:\s*(.*)", qa, re.DOTALL + ) or re.search(r"问题:\s*(.*?)\s*答案:\s*(.*)", qa, re.DOTALL) + + if match: + question = match.group(1).strip() + answer = match.group(2).strip() + else: + if qa: + logger.error("Failed to parse QA pair: %s", qa) + continue + question = question.strip('"') + answer = answer.strip('"') + logger.debug("Question: %s", question) + logger.debug("Answer: %s", answer) + qa_pairs[compute_content_hash(question)] = { + "question": question, + "answer": answer, + } + return qa_pairs + + @staticmethod + def _extract_caption(node_data: dict, molecule_type: str) -> Optional[dict]: # pylint: disable=too-many-branches + """ + Extract molecule-specific caption information from node data. + + :param node_data: Node data dictionary + :param molecule_type: Type of molecule ("dna", "rna", or "protein") + :return: Caption dictionary or None + """ + molecule_type_lower = molecule_type.lower() + + # Check if there's already a caption field (e.g., protein_caption, dna_caption, rna_caption) + caption_key = f"{molecule_type_lower}_caption" + if caption_key in node_data and node_data[caption_key]: + if isinstance(node_data[caption_key], list) and len(node_data[caption_key]) > 0: + # Always return the first element if it's a dict, otherwise return None for consistency + caption_val = node_data[caption_key][0] + return caption_val if isinstance(caption_val, dict) else None + if isinstance(node_data[caption_key], dict): + return node_data[caption_key] + + # Field mappings for each molecule type + field_mapping = { + "protein": [ + "protein_name", "gene_names", "organism", "function", + "sequence", "id", "database", "entry_name", "uniprot_id" + ], + "dna": [ + "gene_name", "gene_description", "organism", "chromosome", + "genomic_location", "function", "gene_type", "id", + "database", "sequence" + ], + "rna": [ + "rna_type", "description", "organism", "related_genes", + "gene_name", "so_term", "id", "database", + "rnacentral_id", "sequence" + ], + } + + # Extract fields based on molecule type + caption = {} + caption_fields = field_mapping.get(molecule_type_lower, []) + for field in caption_fields: + if field in node_data and node_data[field]: + caption[field] = node_data[field] + + # Special handling for protein: check search results and existing protein field + if molecule_type_lower == "protein": + # Check for search result data (from UniProt search) + if "_search_results" in node_data: + search_results = node_data["_search_results"] + if isinstance(search_results, list) and len(search_results) > 0: + first_result = search_results[0] + if isinstance(first_result, dict): + search_caption = { + "id": first_result.get("id", ""), + "protein_name": first_result.get("protein_name", ""), + "gene_names": first_result.get("gene_names", []), + "organism": first_result.get("organism", ""), + "function": first_result.get("function", []), + "sequence": node_data.get("sequence") or first_result.get("sequence", ""), + 
"database": "UniProt" + } + # Remove empty fields and return if any data exists + search_caption = {k: v for k, v in search_caption.items() if v} + if search_caption: + return search_caption + + # Merge with existing protein field if present + if "protein" in node_data and node_data["protein"]: + existing_protein = node_data["protein"] + if isinstance(existing_protein, list) and len(existing_protein) > 0: + existing_protein = ( + existing_protein[0] + if isinstance(existing_protein[0], dict) + else existing_protein + ) + if isinstance(existing_protein, dict): + for key, value in existing_protein.items(): + if key not in caption and value: + caption[key] = value + # Ensure sequence from node_data takes precedence + if "sequence" in node_data and node_data["sequence"]: + caption["sequence"] = node_data["sequence"] + + # Fallback to description if no caption found + if not caption and "description" in node_data: + description = node_data["description"] + if isinstance(description, str) and len(description) > 10: + caption["description"] = description + + return caption if caption else None + + @staticmethod + def _detect_molecule_type(nodes: list[tuple[str, dict]]) -> str: + """ + Detect molecule type from nodes. + Priority: Check node type, then check metadata, then check node data fields. + + :param nodes: List of (node_id, node_data) tuples + :return: Detected molecule type ("dna", "rna", "protein", or "unknown") + """ + if not nodes: + return "unknown" + + # Type indicators for each molecule type + type_indicators = { + "protein": { + "fields": ["protein_name", "uniprot_id", "entry_name", "protein_caption"], + "source_prefix": "protein-", + "description_keywords": ["protein"], + }, + "dna": { + "fields": ["gene_name", "chromosome", "genomic_location"], + "source_prefix": "dna-", + "description_keywords": ["gene", "dna", "chromosome"], + }, + "rna": { + "fields": ["rna_type", "rnacentral_id"], + "source_prefix": "rna-", + "description_keywords": ["rna", "transcript"], + }, + } + + for _, node_data in nodes: + # Priority 1: Check explicit type fields (most reliable) + for field in ["type", "molecule_type"]: + value = node_data.get(field, "").lower() + if value in ("dna", "rna", "protein"): + return value + + # Priority 2: Check source_id prefix + source_id = node_data.get("source_id", "").lower() + for mol_type, indicators in type_indicators.items(): + if source_id.startswith(indicators["source_prefix"]): + return mol_type + + # Priority 3: Check type-specific fields + for mol_type, indicators in type_indicators.items(): + if any(key in node_data for key in indicators["fields"]): + # Special check for DNA: need chromosome or genomic_location + if mol_type == "dna" and not any(key in node_data for key in ["chromosome", "genomic_location"]): + continue + return mol_type + + # Priority 4: Check description keywords + description = node_data.get("description", "").lower() + for mol_type, indicators in type_indicators.items(): + keywords = indicators["description_keywords"] + if any(kw in description for kw in keywords): + # Special check: "protein" in description but not "gene" + if mol_type == "protein" and "gene" in description: + continue + return mol_type + + return "unknown" + + async def generate( + self, + batch: tuple[ + list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]] + ], + ) -> dict[str, Any]: + """ + Generate QAs based on a given batch. + Automatically extracts and attaches molecule-specific caption information. 
+ + :param batch + :return: QA pairs with attached molecule captions + """ + result = {} + prompt = self.build_prompt(batch) + response = await self.llm_client.generate_answer(prompt) + qa_pairs = self.parse_response(response) # generate one or more QA pairs + + nodes, _ = batch + + # Detect molecule type from nodes + molecule_type = self._detect_molecule_type(nodes) + + # Extract captions for all molecule types from nodes + captions = {"dna": None, "rna": None, "protein": None} + caption_attached = False + + for node in nodes: + _, node_data = node + + # Check for pre-extracted captions (from partition_service) + for mol_type in ["dna", "rna", "protein"]: + caption_key = f"{mol_type}_caption" + if caption_key in node_data and node_data[caption_key]: + captions[mol_type] = node_data[caption_key] + caption_attached = True + + # If no pre-extracted captions, extract from node_data using the detected molecule_type + if not caption_attached: + caption = self._extract_caption(node_data, molecule_type) + if caption: + captions[molecule_type] = caption + caption_attached = True + break # Only need to extract once per batch + + # Attach all captions to QA pairs + for qa in qa_pairs.values(): + qa["dna"] = captions["dna"] if captions["dna"] else "" + qa["rna"] = captions["rna"] if captions["rna"] else "" + qa["protein"] = captions["protein"] if captions["protein"] else "" + + if not caption_attached: + node_sample = ( + dict(list(nodes[0][1].items())[:5]) if nodes else 'No nodes' + ) + logger.warning( + "No caption extracted for molecule_type=%s. Node data sample: %s", + molecule_type, node_sample + ) + + result.update(qa_pairs) + return result + + @staticmethod + def format_generation_results( + results: list[dict], output_data_format: str + ) -> list[dict[str, Any]]: + """ + Format generation results with molecule-specific caption fields. + Supports dna, rna, and protein fields in output. 
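+        For example, a single Alpaca-format item produced below looks like (illustrative values): +            {"instruction": "What is the function of this protein?", "input": "", "output": "...", "dna": "", "rna": "", "protein": {"protein_name": "..."}}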
+ """ + # Extract QA pairs and molecule captions + qa_items = [ + { + "question": v["question"], + "answer": v["answer"], + "dna": v.get("dna", ""), + "rna": v.get("rna", ""), + "protein": v.get("protein", ""), + } + for item in results + for k, v in item.items() + ] + + # Format based on output format + if output_data_format == "Alpaca": + return [ + { + "instruction": qa["question"], + "input": "", + "output": qa["answer"], + "dna": qa["dna"], + "rna": qa["rna"], + "protein": qa["protein"], + } + for qa in qa_items + ] + if output_data_format == "Sharegpt": + return [ + { + "conversations": [ + { + "from": "human", + "value": [ + { + "text": qa["question"], + "dna": qa["dna"], + "rna": qa["rna"], + "protein": qa["protein"], + } + ], + }, + {"from": "gpt", "value": qa["answer"]}, + ] + } + for qa in qa_items + ] + if output_data_format == "ChatML": + return [ + { + "messages": [ + { + "role": "user", + "content": [ + { + "text": qa["question"], + "dna": qa["dna"], + "rna": qa["rna"], + "protein": qa["protein"], + } + ], + }, + {"role": "assistant", "content": qa["answer"]}, + ] + } + for qa in qa_items + ] + raise ValueError(f"Unknown output data format: {output_data_format}") diff --git a/graphgen/models/kg_builder/__init__.py b/graphgen/models/kg_builder/__init__.py index 1e7e2c44..aa0339c6 100644 --- a/graphgen/models/kg_builder/__init__.py +++ b/graphgen/models/kg_builder/__init__.py @@ -1,2 +1,3 @@ from .light_rag_kg_builder import LightRAGKGBuilder from .mm_kg_builder import MMKGBuilder +from .omics_kg_builder import OmicsKGBuilder diff --git a/graphgen/models/kg_builder/omics_kg_builder.py b/graphgen/models/kg_builder/omics_kg_builder.py new file mode 100644 index 00000000..c5c92a94 --- /dev/null +++ b/graphgen/models/kg_builder/omics_kg_builder.py @@ -0,0 +1,291 @@ +import re +from collections import Counter, defaultdict +from typing import Dict, List, Tuple + +import numpy as np + +from graphgen.bases import BaseGraphStorage, BaseKGBuilder, BaseLLMWrapper, Chunk +from graphgen.templates import KG_SUMMARIZATION_PROMPT, OMICS_KG_EXTRACTION_PROMPT +from graphgen.utils import ( + detect_main_language, + handle_single_entity_extraction, + handle_single_relationship_extraction, + logger, + pack_history_conversations, + split_string_by_multi_markers, +) + + +class OmicsKGBuilder(BaseKGBuilder): + """ + Knowledge graph builder for multi-omics data (DNA, RNA, protein). + Extracts entities and relationships from sequence chunks and their metadata. + """ + + def __init__(self, llm_client: BaseLLMWrapper, max_loop: int = 3): + super().__init__(llm_client) + self.max_loop = max_loop + + async def extract( + self, chunk: Chunk + ) -> Tuple[Dict[str, List[dict]], Dict[Tuple[str, str], List[dict]]]: + """ + Extract entities and relationships from a sequence chunk using the LLM client. 
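+        Return shape, for illustration: nodes maps entity_name -> list of entity dicts, edges maps (src_id, tgt_id) -> list of relation dicts, e.g. +            nodes = {"TP53": [{"entity_name": "TP53", "entity_type": "GENE", "description": "...", "source_id": "..."}]} +            edges = {("TP53", "MDM2"): [{"src_id": "TP53", "tgt_id": "MDM2", "description": "..."}]}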
+ + :param chunk: Sequence chunk with metadata + :return: (nodes_data, edges_data) + """ + chunk_id = chunk.id + content = chunk.content + metadata = chunk.metadata + + # Extract sequence and metadata information + sequence_chunk = content or metadata.get("sequence", "") + # molecule_type is used in _format_metadata indirectly via metadata dict + _ = metadata.get("molecule_type", "").lower() + + # Build metadata text for prompt + metadata_text = self._format_metadata(metadata) + + # Detect language from metadata text (defaults to English if no Chinese detected) + language = detect_main_language(metadata_text) + + # Build prompt with sequence and metadata + hint_prompt = OMICS_KG_EXTRACTION_PROMPT[language]["TEMPLATE"].format( + **OMICS_KG_EXTRACTION_PROMPT["FORMAT"], + metadata_text=metadata_text, + sequence_chunk=sequence_chunk[:500] if sequence_chunk else "", # Limit sequence length in prompt + ) + + # step 2: initial glean + final_result = await self.llm_client.generate_answer(hint_prompt) + logger.debug("First extraction result: %s", final_result) + + # step 3: iterative refinement + history = pack_history_conversations(hint_prompt, final_result) + for loop_idx in range(self.max_loop): + if_loop_result = await self.llm_client.generate_answer( + text=OMICS_KG_EXTRACTION_PROMPT[language]["IF_LOOP"], history=history + ) + if_loop_result = if_loop_result.strip().strip('"').strip("'").lower() + if if_loop_result != "yes": + break + + glean_result = await self.llm_client.generate_answer( + text=OMICS_KG_EXTRACTION_PROMPT[language]["CONTINUE"], history=history + ) + logger.debug("Loop %s glean: %s", loop_idx + 1, glean_result) + + history += pack_history_conversations( + OMICS_KG_EXTRACTION_PROMPT[language]["CONTINUE"], glean_result + ) + final_result += glean_result + + # step 4: parse the final result + records = split_string_by_multi_markers( + final_result, + [ + OMICS_KG_EXTRACTION_PROMPT["FORMAT"]["record_delimiter"], + OMICS_KG_EXTRACTION_PROMPT["FORMAT"]["completion_delimiter"], + ], + ) + + nodes = defaultdict(list) + edges = defaultdict(list) + + for record in records: + match = re.search(r"\((.*)\)", record) + if not match: + continue + inner = match.group(1) + + attributes = split_string_by_multi_markers( + inner, [OMICS_KG_EXTRACTION_PROMPT["FORMAT"]["tuple_delimiter"]] + ) + + entity = await handle_single_entity_extraction(attributes, chunk_id) + if entity is not None: + nodes[entity["entity_name"]].append(entity) + continue + + relation = await handle_single_relationship_extraction(attributes, chunk_id) + if relation is not None: + key = (relation["src_id"], relation["tgt_id"]) + edges[key].append(relation) + + return dict(nodes), dict(edges) + + @staticmethod + def _format_metadata(metadata: dict) -> str: + """ + Format metadata dictionary into a readable text string for the prompt. 
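+        Example (illustrative): {"organism": "Homo sapiens", "gene_synonyms": ["p53", "LFS1"]} +        is rendered as "organism: Homo sapiens\ngene_synonyms: p53, LFS1".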
+ + :param metadata: Metadata dictionary from chunk + :return: Formatted metadata text + """ + # Filter out internal fields and format + exclude_fields = { + "_doc_id", + "_chunk_id", + "chunk_index", + "total_chunks", + "length", + "type", + "content", + "sequence", + } + + metadata_items = [] + for key, value in metadata.items(): + if key in exclude_fields: + continue + # Handle numpy arrays and other array-like objects + if isinstance(value, np.ndarray): + if value.size == 0: + continue + # Convert numpy array to list for processing + value = value.tolist() + if value is None: + continue + # Check for empty string (but not numpy arrays which we already handled) + if isinstance(value, str) and value == "": + continue + if isinstance(value, list): + value = ", ".join(str(v) for v in value) + metadata_items.append(f"{key}: {value}") + + return "\n".join(metadata_items) if metadata_items else "No additional metadata available." + + async def merge_nodes( + self, + node_data: tuple[str, List[dict]], + kg_instance: BaseGraphStorage, + ) -> None: + """Merge extracted nodes into the knowledge graph.""" + entity_name, node_data = node_data + entity_types = [] + source_ids = [] + descriptions = [] + + node = kg_instance.get_node(entity_name) + if node is not None: + entity_types.append(node["entity_type"]) + source_ids.extend( + split_string_by_multi_markers(node["source_id"], ["<SEP>"]) + ) + descriptions.append(node["description"]) + + # Take the most frequent entity_type + entity_type = sorted( + Counter([dp["entity_type"] for dp in node_data] + entity_types).items(), + key=lambda x: x[1], + reverse=True, + )[0][0] + + description = "<SEP>".join( + sorted(set([dp["description"] for dp in node_data] + descriptions)) + ) + description = await self._handle_kg_summary(entity_name, description) + + source_id = "<SEP>".join( + set([dp["source_id"] for dp in node_data] + source_ids) + ) + + node_data_dict = { + "entity_type": entity_type, + "description": description, + "source_id": source_id, + } + + # Preserve sequence from existing node if present (e.g., added by partition_service) + if node is not None and "sequence" in node and node["sequence"]: + node_data_dict["sequence"] = node["sequence"] + + kg_instance.upsert_node(entity_name, node_data=node_data_dict) + + async def merge_edges( + self, + edges_data: tuple[Tuple[str, str], List[dict]], + kg_instance: BaseGraphStorage, + ) -> None: + """Merge extracted edges into the knowledge graph.""" + (src_id, tgt_id), edge_data = edges_data + + # Skip self-loops (edges where source and target are the same) + # This can happen when LLM extracts invalid relationships + if src_id == tgt_id: + logger.debug("Skipping self-loop edge: (%s, %s)", src_id, tgt_id) + return + + source_ids = [] + descriptions = [] + + edge = kg_instance.get_edge(src_id, tgt_id) + if edge is not None: + source_ids.extend( + split_string_by_multi_markers(edge["source_id"], ["<SEP>"]) + ) + descriptions.append(edge["description"]) + + description = "<SEP>".join( + sorted(set([dp["description"] for dp in edge_data] + descriptions)) + ) + source_id = "<SEP>".join( + set([dp["source_id"] for dp in edge_data] + source_ids) + ) + + for insert_id in [src_id, tgt_id]: + if not kg_instance.has_node(insert_id): + kg_instance.upsert_node( + insert_id, + node_data={ + "source_id": source_id, + "description": description, + "entity_type": "UNKNOWN", + }, + ) + + description = await self._handle_kg_summary( + f"({src_id}, {tgt_id})", description + ) + + kg_instance.upsert_edge( + src_id, + tgt_id, + edge_data={"source_id": 
source_id, "description": description}, + ) + + async def _handle_kg_summary( + self, + entity_or_relation_name: str, + description: str, + max_summary_tokens: int = 200, + ) -> str: + """ + Handle knowledge graph summary for omics entities/relations. + + :param entity_or_relation_name + :param description + :param max_summary_tokens + :return summary + """ + tokenizer_instance = self.llm_client.tokenizer + language = detect_main_language(description) + + tokens = tokenizer_instance.encode(description) + if len(tokens) < max_summary_tokens: + return description + + use_description = tokenizer_instance.decode(tokens[:max_summary_tokens]) + prompt = KG_SUMMARIZATION_PROMPT[language]["TEMPLATE"].format( + entity_name=entity_or_relation_name, + description_list=use_description.split(""), + **KG_SUMMARIZATION_PROMPT["FORMAT"], + ) + new_description = await self.llm_client.generate_answer(prompt) + logger.info( + "Entity or relation %s summary: %s", + entity_or_relation_name, + new_description, + ) + return new_description diff --git a/graphgen/models/llm/local/sglang_wrapper.py b/graphgen/models/llm/local/sglang_wrapper.py index e8648613..1918fc79 100644 --- a/graphgen/models/llm/local/sglang_wrapper.py +++ b/graphgen/models/llm/local/sglang_wrapper.py @@ -13,10 +13,10 @@ class SGLangWrapper(BaseLLMWrapper): def __init__( self, model: str, + tp_size: int = 1, temperature: float = 0.0, top_p: float = 1.0, topk: int = 5, - tp_size: int = 1, **kwargs: Any, ): super().__init__(temperature=temperature, top_p=top_p, **kwargs) diff --git a/graphgen/models/partitioner/anchor_bfs_partitioner.py b/graphgen/models/partitioner/anchor_bfs_partitioner.py index 09133af7..b76e3ab0 100644 --- a/graphgen/models/partitioner/anchor_bfs_partitioner.py +++ b/graphgen/models/partitioner/anchor_bfs_partitioner.py @@ -1,6 +1,6 @@ import random from collections import deque -from typing import Any, Iterable, List, Literal, Set, Tuple +from typing import Any, Iterable, List, Set, Tuple from graphgen.bases import BaseGraphStorage from graphgen.bases.datatypes import Community @@ -22,12 +22,16 @@ class AnchorBFSPartitioner(BFSPartitioner): def __init__( self, - *, - anchor_type: Literal["image"] = "image", + anchor_type: list | None = None, anchor_ids: Set[str] | None = None, ) -> None: super().__init__() - self.anchor_type = anchor_type + if anchor_type is None: + anchor_type = ["image"] + if isinstance(anchor_type, str): + self.anchor_types = [anchor_type] + else: + self.anchor_types = list(anchor_type) self.anchor_ids = anchor_ids def partition( @@ -60,7 +64,7 @@ def partition( if comm_n or comm_e: yield Community(id=seed_node, nodes=comm_n, edges=comm_e) - def _pick_anchor_ids( + def _pick_anchor_ids( # pylint: disable=too-many-branches self, nodes: List[tuple[str, dict]], ) -> Set[str]: @@ -68,10 +72,53 @@ def _pick_anchor_ids( return self.anchor_ids anchor_ids: Set[str] = set() + anchor_types_lower = [at.lower() for at in self.anchor_types] + for node_id, meta in nodes: + # Check if node matches any of the anchor types + matched = False + + # Check 1: entity_type (for image, etc.) 
node_type = str(meta.get("entity_type", "")).lower() -            if self.anchor_type.lower() in node_type: +            for anchor_type_lower in anchor_types_lower: +                if anchor_type_lower in node_type: +                    anchor_ids.add(node_id) +                    matched = True +                    break + +            if matched: +                continue + +            # Check 2: molecule_type (for omics data: dna, rna, protein) +            molecule_type = str(meta.get("molecule_type", "")).lower() +            if molecule_type in anchor_types_lower: anchor_ids.add(node_id) +                continue + +            # Check 3: source_id prefix (for omics data: dna-, rna-, protein-) +            source_id = str(meta.get("source_id", "")).lower() +            for anchor_type_lower in anchor_types_lower: +                if source_id.startswith(f"{anchor_type_lower}-"): +                    anchor_ids.add(node_id) +                    matched = True +                    break + +            if matched: +                continue + +            # Check 4: Check if source_id contains multiple IDs separated by <SEP> + if "<sep>" in source_id: + source_ids = source_id.split("<sep>") + for sid in source_ids: + sid = sid.strip() + for anchor_type_lower in anchor_types_lower: + if sid.startswith(f"{anchor_type_lower}-"): + anchor_ids.add(node_id) + matched = True + break + if matched: + break + return anchor_ids @staticmethod @@ -113,7 +160,21 @@ def _grow_community( if it in used_e: continue used_e.add(it) - u, v = it + # Convert frozenset to tuple for edge representation + # Note: Self-loops should be filtered during graph construction, + # but we handle edge cases defensively + try: + u, v = tuple(it) + except ValueError: + # Handle edge case: frozenset with unexpected number of elements + # This should not happen if graph construction is correct + edge_nodes = list(it) + if len(edge_nodes) == 1: + # Self-loop edge (should have been filtered during graph construction) + u, v = edge_nodes[0], edge_nodes[0] + else: + # Invalid edge, skip it + continue comm_e.append((u, v)) cnt += 1 for n in it: diff --git a/graphgen/models/searcher/db/ncbi_searcher.py b/graphgen/models/searcher/db/ncbi_searcher.py index f453c700..efaacf4b 100644 --- a/graphgen/models/searcher/db/ncbi_searcher.py +++ b/graphgen/models/searcher/db/ncbi_searcher.py @@ -19,16 +19,15 @@ ) from graphgen.bases import BaseSearcher -from graphgen.utils import logger @lru_cache(maxsize=None) def _get_pool(): - return ThreadPoolExecutor(max_workers=10) + return ThreadPoolExecutor(max_workers=20) # NOTE: can increase for better parallelism # ensure only one NCBI request at a time -_ncbi_lock = asyncio.Lock() +_blast_lock = asyncio.Lock() class NCBISearch(BaseSearcher): @@ -49,6 +48,8 @@ def __init__( email: str = "email@example.com", api_key: str = "", tool: str = "GraphGen", + blast_num_threads: int = 4, + working_dir: str = "cache", ): """ Initialize the NCBI Search client. @@ -59,8 +60,10 @@ def __init__( email (str): Email address for NCBI API requests. api_key (str): API key for NCBI API requests, see https://account.ncbi.nlm.nih.gov/settings/. tool (str): Tool name for NCBI API requests. + blast_num_threads (int): Number of threads for BLAST search. + working_dir (str): Working directory for log files. """ - super().__init__() + super().__init__(working_dir=working_dir) Entrez.timeout = 60 # 60 seconds timeout Entrez.email = email Entrez.tool = tool @@ -70,9 +73,17 @@ def __init__( Entrez.sleep_between_tries = 5 self.use_local_blast = use_local_blast self.local_blast_db = local_blast_db -        if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.nhr"): -            logger.error("Local BLAST database files not found. 
Please check the path.") - self.use_local_blast = False + self.blast_num_threads = blast_num_threads + if self.use_local_blast: + # Check for single-file database (.nhr) or multi-file database (.00.nhr) + db_exists = ( + os.path.isfile(f"{self.local_blast_db}.nhr") or + os.path.isfile(f"{self.local_blast_db}.00.nhr") + ) + if not db_exists: + self.logger.error("Local BLAST database files not found. Please check the path.") + self.logger.error("Expected: %s.nhr or %s.00.nhr", self.local_blast_db, self.local_blast_db) + self.use_local_blast = False @staticmethod def _nested_get(data: dict, *keys, default=None): @@ -87,14 +98,16 @@ def _nested_get(data: dict, *keys, default=None): def _infer_molecule_type_detail(accession: Optional[str], gene_type: Optional[int] = None) -> Optional[str]: """Infer molecule_type_detail from accession prefix or gene type.""" if accession: - if accession.startswith(("NM_", "XM_")): - return "mRNA" - if accession.startswith(("NC_", "NT_")): - return "genomic DNA" - if accession.startswith(("NR_", "XR_")): - return "RNA" - if accession.startswith("NG_"): - return "genomic region" + # Map accession prefixes to molecule types + prefix_map = { + ("NM_", "XM_"): "mRNA", + ("NC_", "NT_"): "genomic DNA", + ("NR_", "XR_"): "RNA", + ("NG_",): "genomic region", + } + for prefixes, mol_type in prefix_map.items(): + if accession.startswith(prefixes): + return mol_type # Fallback: infer from gene type if available if gene_type is not None: gene_type_map = { @@ -153,7 +166,6 @@ def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict: None, ) # Fallback: if no type 3 accession, try any available accession - # This is needed for genes that don't have mRNA transcripts but have other sequence records if not representative_accession: representative_accession = next( ( @@ -209,6 +221,12 @@ def _gene_record_to_dict(self, gene_record, gene_id: str) -> dict: "_representative_accession": representative_accession, } + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RequestException, IncompleteRead)), + reraise=True, + ) def get_by_gene_id(self, gene_id: str, preferred_accession: Optional[str] = None) -> Optional[dict]: """Get gene information by Gene ID.""" def _extract_metadata_from_genbank(result: dict, accession: str): @@ -217,12 +235,7 @@ def _extract_metadata_from_genbank(result: dict, accession: str): record = SeqIO.read(handle, "genbank") result["title"] = record.description - result["molecule_type_detail"] = ( - "mRNA" if accession.startswith(("NM_", "XM_")) else - "genomic DNA" if accession.startswith(("NC_", "NT_")) else - "RNA" if accession.startswith(("NR_", "XR_")) else - "genomic region" if accession.startswith("NG_") else "N/A" - ) + result["molecule_type_detail"] = self._infer_molecule_type_detail(accession) or "N/A" for feature in record.features: if feature.type == "source": @@ -249,7 +262,7 @@ def _extract_sequence_from_fasta(result: dict, accession: str): result["sequence"] = str(fasta_record.seq) result["sequence_length"] = len(fasta_record.seq) except Exception as fasta_exc: - logger.warning( + self.logger.warning( "Failed to extract sequence from accession %s using FASTA format: %s", accession, fasta_exc ) @@ -257,25 +270,62 @@ def _extract_sequence_from_fasta(result: dict, accession: str): result["sequence_length"] = None return result + def _extract_sequence(result: dict, accession: str): + """ + Extract sequence using the appropriate method based on configuration. 
+ If use_local_blast=True, use local database. Otherwise, use NCBI API. + Always fetches sequence (no option to skip). + """ + # If using local BLAST, use local database + if self.use_local_blast: + sequence = self._extract_sequence_from_local_db(accession) + + if sequence: + result["sequence"] = sequence + result["sequence_length"] = len(sequence) + else: + # Failed to extract from local DB, set to None (no fallback to API) + result["sequence"] = None + result["sequence_length"] = None + self.logger.warning( + "Failed to extract sequence from local DB for accession %s. " + "Not falling back to NCBI API as use_local_blast=True.", + accession + ) + else: + # Use NCBI API to fetch sequence + result = _extract_sequence_from_fasta(result, accession) + + return result + try: with Entrez.efetch(db="gene", id=gene_id, retmode="xml") as handle: gene_record = Entrez.read(handle) - if not gene_record: - return None - result = self._gene_record_to_dict(gene_record, gene_id) - if accession := (preferred_accession or result.get("_representative_accession")): - result = _extract_metadata_from_genbank(result, accession) - result = _extract_sequence_from_fasta(result, accession) + if not gene_record: + return None + + result = self._gene_record_to_dict(gene_record, gene_id) - result.pop("_representative_accession", None) - return result + if accession := (preferred_accession or result.get("_representative_accession")): + result = _extract_metadata_from_genbank(result, accession) + # Extract sequence using appropriate method + result = _extract_sequence(result, accession) + + result.pop("_representative_accession", None) + return result except (RequestException, IncompleteRead): raise except Exception as exc: - logger.error("Gene ID %s not found: %s", gene_id, exc) + self.logger.error("Gene ID %s not found: %s", gene_id, exc) return None + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RequestException, IncompleteRead)), + reraise=True, + ) def get_by_accession(self, accession: str) -> Optional[dict]: """Get sequence information by accession number.""" def _extract_gene_id(link_handle): @@ -297,20 +347,28 @@ def _extract_gene_id(link_handle): gene_id = _extract_gene_id(link_handle) if not gene_id: - logger.warning("Accession %s has no associated GeneID", accession) + self.logger.warning("Accession %s has no associated GeneID", accession) return None result = self.get_by_gene_id(gene_id, preferred_accession=accession) + if result: result["id"] = accession result["url"] = f"https://www.ncbi.nlm.nih.gov/nuccore/{accession}" + return result except (RequestException, IncompleteRead): raise except Exception as exc: - logger.error("Accession %s not found: %s", accession, exc) + self.logger.error("Accession %s not found: %s", accession, exc) return None + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_exception_type((RequestException, IncompleteRead)), + reraise=True, + ) def get_best_hit(self, keyword: str) -> Optional[dict]: """Search NCBI Gene database with a keyword and return the best hit.""" if not keyword.strip(): @@ -320,31 +378,87 @@ def get_best_hit(self, keyword: str) -> Optional[dict]: for search_term in [f"{keyword}[Gene] OR {keyword}[All Fields]", keyword]: with Entrez.esearch(db="gene", term=search_term, retmax=1, sort="relevance") as search_handle: search_results = Entrez.read(search_handle) - if len(gene_id := search_results.get("IdList", [])) > 0: - return 
self.get_by_gene_id(gene_id) + + if len(gene_id := search_results.get("IdList", [])) > 0: + result = self.get_by_gene_id(gene_id[0]) + return result except (RequestException, IncompleteRead): raise except Exception as e: - logger.error("Keyword %s not found: %s", keyword, e) + self.logger.error("Keyword %s not found: %s", keyword, e) return None + def _extract_sequence_from_local_db(self, accession: str) -> Optional[str]: + """Extract sequence from local BLAST database using blastdbcmd.""" + try: + cmd = [ + "blastdbcmd", + "-db", self.local_blast_db, + "-entry", accession, + "-outfmt", "%s" # Only sequence, no header + ] + sequence = subprocess.check_output( + cmd, + text=True, + timeout=10, # 10 second timeout for local extraction + stderr=subprocess.DEVNULL + ).strip() + return sequence if sequence else None + except subprocess.TimeoutExpired: + self.logger.warning("Timeout extracting sequence from local DB for accession %s", accession) + return None + except Exception as exc: + self.logger.warning("Failed to extract sequence from local DB for accession %s: %s", accession, exc) + return None + def _local_blast(self, seq: str, threshold: float) -> Optional[str]: - """Perform local BLAST search using local BLAST database.""" + """ + Perform local BLAST search using local BLAST database. + Optimized with multi-threading and faster output format. + """ try: with tempfile.NamedTemporaryFile(mode="w+", suffix=".fa", delete=False) as tmp: tmp.write(f">query\n{seq}\n") tmp_name = tmp.name + # Optimized BLAST command with: + # - num_threads: Use multiple threads for faster search + # - outfmt 6 sacc: Only return accession (minimal output) + # - max_target_seqs 1: Only need the best hit + # - evalue: Threshold for significance cmd = [ "blastn", "-db", self.local_blast_db, "-query", tmp_name, - "-evalue", str(threshold), "-max_target_seqs", "1", "-outfmt", "6 sacc" + "-evalue", str(threshold), + "-max_target_seqs", "1", + "-num_threads", str(self.blast_num_threads), + "-outfmt", "6 sacc" # Only accession, tab-separated ] - logger.debug("Running local blastn: %s", " ".join(cmd)) - out = subprocess.check_output(cmd, text=True).strip() + self.logger.debug("Running local blastn (threads=%d): %s", + self.blast_num_threads, " ".join(cmd)) + + # Run BLAST with timeout to avoid hanging + try: + out = subprocess.check_output( + cmd, + text=True, + timeout=300, # 5 minute timeout for BLAST search + stderr=subprocess.DEVNULL # Suppress BLAST warnings to reduce I/O + ).strip() + except subprocess.TimeoutExpired: + self.logger.warning("BLAST search timed out after 5 minutes for sequence") + os.remove(tmp_name) + return None + os.remove(tmp_name) return out.split("\n", maxsplit=1)[0] if out else None except Exception as exc: - logger.error("Local blastn failed: %s", exc) + self.logger.error("Local blastn failed: %s", exc) + # Clean up temp file if it still exists + try: + if 'tmp_name' in locals(): + os.remove(tmp_name) + except Exception: + pass return None def get_by_fasta(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: @@ -362,13 +476,13 @@ def _extract_and_normalize_sequence(sequence: str) -> Optional[str]: def _process_network_blast_result(blast_record, seq: str, threshold: float) -> Optional[dict]: """Process network BLAST result and return dictionary or None.""" if not blast_record.alignments: - logger.info("No BLAST hits found for the given sequence.") + self.logger.info("No BLAST hits found for the given sequence.") return None best_alignment = blast_record.alignments[0] best_hsp = 
best_alignment.hsps[0] if best_hsp.expect > threshold: - logger.info("No BLAST hits below the threshold E-value.") + self.logger.info("No BLAST hits below the threshold E-value.") return None hit_id = best_alignment.hit_id @@ -389,23 +503,35 @@ def _process_network_blast_result(blast_record, seq: str, threshold: float) -> O try: if not (seq := _extract_and_normalize_sequence(sequence)): - logger.error("Empty or invalid DNA sequence provided.") + self.logger.error("Empty or invalid DNA sequence provided.") return None # Try local BLAST first if enabled - if self.use_local_blast and (accession := self._local_blast(seq, threshold)): - logger.debug("Local BLAST found accession: %s", accession) - return self.get_by_accession(accession) - - # Fall back to network BLAST - logger.debug("Falling back to NCBIWWW.qblast") + if self.use_local_blast: + accession = self._local_blast(seq, threshold) + + if accession: + self.logger.debug("Local BLAST found accession: %s", accession) + # When using local BLAST, skip sequence fetching by default (faster, fewer API calls) + # Sequence is already known from the query, so we only need metadata + result = self.get_by_accession(accession) + return result + + self.logger.info( + "Local BLAST found no match for sequence. " + "API fallback disabled when using local database." + ) + return None + # Fall back to network BLAST only if local BLAST is not enabled + self.logger.debug("Falling back to NCBIWWW.qblast") with NCBIWWW.qblast("blastn", "nr", seq, hitlist_size=1, expect=threshold) as result_handle: - return _process_network_blast_result(NCBIXML.read(result_handle), seq, threshold) + result = _process_network_blast_result(NCBIXML.read(result_handle), seq, threshold) + return result except (RequestException, IncompleteRead): raise except Exception as e: - logger.error("BLAST search failed: %s", e) + self.logger.error("BLAST search failed: %s", e) return None @retry( @@ -417,25 +543,34 @@ def _process_network_blast_result(blast_record, seq: str, threshold: float) -> O async def search(self, query: str, threshold: float = 0.01, **kwargs) -> Optional[Dict]: """Search NCBI with either a gene ID, accession number, keyword, or DNA sequence.""" if not query or not isinstance(query, str): - logger.error("Empty or non-string input.") + self.logger.error("Empty or non-string input.") return None query = query.strip() - logger.debug("NCBI search query: %s", query) + self.logger.debug("NCBI search query: %s", query) loop = asyncio.get_running_loop() - # limit concurrent requests (NCBI rate limit: max 3 requests per second) - async with _ncbi_lock: - # Auto-detect query type and execute in thread pool - if query.startswith(">") or re.fullmatch(r"[ATCGN\s]+", query, re.I): - result = await loop.run_in_executor(_get_pool(), self.get_by_fasta, query, threshold) - elif re.fullmatch(r"^\d+$", query): - result = await loop.run_in_executor(_get_pool(), self.get_by_gene_id, query) - elif re.fullmatch(r"[A-Z]{2}_\d+\.?\d*", query, re.I): - result = await loop.run_in_executor(_get_pool(), self.get_by_accession, query) - else: - result = await loop.run_in_executor(_get_pool(), self.get_best_hit, query) + # Auto-detect query type and execute in thread pool + # All methods need lock because they all call NCBI API (rate limit: max 3 requests per second) + # Even if get_by_fasta uses local BLAST, it still calls get_by_accession which needs API + async def _execute_with_lock(func, *args): + """Execute function with lock for NCBI API calls.""" + async with _blast_lock: + return await 
loop.run_in_executor(_get_pool(), func, *args) + + if query.startswith(">") or re.fullmatch(r"[ATCGN\s]+", query, re.I): + # FASTA sequence: always use lock (even with local BLAST, get_by_accession needs API) + result = await _execute_with_lock(self.get_by_fasta, query, threshold) + elif re.fullmatch(r"^\d+$", query): + # Gene ID: always use lock (network API call) + result = await _execute_with_lock(self.get_by_gene_id, query) + elif re.fullmatch(r"[A-Z]{2}_\d+\.?\d*", query, re.I): + # Accession: always use lock (network API call) + result = await _execute_with_lock(self.get_by_accession, query) + else: + # Keyword: always use lock (network API call) + result = await _execute_with_lock(self.get_best_hit, query) if result: result["_search_query"] = query diff --git a/graphgen/models/searcher/db/rnacentral_searcher.py b/graphgen/models/searcher/db/rnacentral_searcher.py index 58c5e86e..2d5ef138 100644 --- a/graphgen/models/searcher/db/rnacentral_searcher.py +++ b/graphgen/models/searcher/db/rnacentral_searcher.py @@ -18,12 +18,11 @@ ) from graphgen.bases import BaseSearcher -from graphgen.utils import logger @lru_cache(maxsize=None) def _get_pool(): - return ThreadPoolExecutor(max_workers=10) + return ThreadPoolExecutor(max_workers=20) # NOTE: can increase for better parallelism class RNACentralSearch(BaseSearcher): """ Search RNAcentral for RNA information. Supports: 1) Get RNA by RNAcentral ID. 2) Search with keyword. 3) Search with RNA sequence. API Documentation: https://rnacentral.org/api/v1 """ - def __init__(self, use_local_blast: bool = False, local_blast_db: str = "rna_db"): - super().__init__() + def __init__( + self, + use_local_blast: bool = False, + local_blast_db: str = "rna_db", + api_timeout: int = 30, + blast_num_threads: int = 4, + working_dir: str = "cache", + ): + super().__init__(working_dir=working_dir) self.base_url = "https://rnacentral.org/api/v1" self.headers = {"Accept": "application/json"} self.use_local_blast = use_local_blast self.local_blast_db = local_blast_db + self.api_timeout = api_timeout + self.blast_num_threads = blast_num_threads # Number of threads for BLAST search + if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.nhr"): - logger.error("Local BLAST database files not found. Please check the path.") + self.logger.error("Local BLAST database files not found. 
Please check the path.") self.use_local_blast = False @staticmethod @@ -58,7 +67,8 @@ def _rna_data_to_dict( acc = xref.get("accession", {}) if s := acc.get("species"): organisms.add(s) - if g := acc.get("gene", "").strip(): + gene_value = acc.get("gene") + if isinstance(gene_value, str) and (g := gene_value.strip()): gene_names.add(g) if m := xref.get("modifications"): modifications.extend(m) @@ -151,19 +161,29 @@ def get_by_rna_id(self, rna_id: str) -> Optional[dict]: url = f"{self.base_url}/rna/{rna_id}" url += "?flat=true" - resp = requests.get(url, headers=self.headers, timeout=30) + resp = requests.get(url, headers=self.headers, timeout=self.api_timeout) resp.raise_for_status() rna_data = resp.json() xrefs_data = rna_data.get("xrefs", []) - return self._rna_data_to_dict(rna_id, rna_data, xrefs_data) + result = self._rna_data_to_dict(rna_id, rna_data, xrefs_data) + return result + except requests.Timeout as e: + self.logger.warning("Timeout getting RNA ID %s (timeout=%ds): %s", rna_id, self.api_timeout, e) + return None except requests.RequestException as e: - logger.error("Network error getting RNA ID %s: %s", rna_id, e) + self.logger.error("Network error getting RNA ID %s: %s", rna_id, e) return None except Exception as e: # pylint: disable=broad-except - logger.error("Unexpected error getting RNA ID %s: %s", rna_id, e) + self.logger.error("Unexpected error getting RNA ID %s: %s", rna_id, e) return None + @retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=2, max=10), + retry=retry_if_exception_type((requests.Timeout, requests.RequestException)), + reraise=False, + ) def get_best_hit(self, keyword: str) -> Optional[dict]: """ Search RNAcentral with a keyword and return the best hit. @@ -172,20 +192,20 @@ def get_best_hit(self, keyword: str) -> Optional[dict]: """ keyword = keyword.strip() if not keyword: - logger.warning("Empty keyword provided to get_best_hit") + self.logger.warning("Empty keyword provided to get_best_hit") return None try: url = f"{self.base_url}/rna" params = {"search": keyword, "format": "json"} - resp = requests.get(url, params=params, headers=self.headers, timeout=30) + resp = requests.get(url, params=params, headers=self.headers, timeout=self.api_timeout) resp.raise_for_status() data = resp.json() results = data.get("results", []) if not results: - logger.info("No search results for keyword: %s", keyword) + self.logger.info("No search results for keyword: %s", keyword) return None first_result = results[0] @@ -195,36 +215,68 @@ def get_best_hit(self, keyword: str) -> Optional[dict]: detailed = self.get_by_rna_id(rna_id) if detailed: return detailed - logger.debug("Using search result data for %s", rna_id or "unknown") + self.logger.debug("Using search result data for %s", rna_id or "unknown") return self._rna_data_to_dict(rna_id or "", first_result) except requests.RequestException as e: - logger.error("Network error searching keyword '%s': %s", keyword, e) + self.logger.error("Network error searching keyword '%s': %s", keyword, e) return None except Exception as e: - logger.error("Unexpected error searching keyword '%s': %s", keyword, e) + self.logger.error("Unexpected error searching keyword '%s': %s", keyword, e) return None def _local_blast(self, seq: str, threshold: float) -> Optional[str]: - """Perform local BLAST search using local BLAST database.""" + """ + Perform local BLAST search using local BLAST database. + Optimized with multi-threading and faster output format. 
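+ + Roughly equivalent to the shell command (placeholders, not literal paths): + blastn -db <local_blast_db> -query query.fa -evalue <threshold> -max_target_seqs 1 -num_threads <blast_num_threads> -outfmt "6 sacc"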
+ """ try: + # Use temporary file for query sequence with tempfile.NamedTemporaryFile(mode="w+", suffix=".fa", delete=False) as tmp: tmp.write(f">query\n{seq}\n") tmp_name = tmp.name + # Optimized BLAST command with: + # - num_threads: Use multiple threads for faster search + # - outfmt 6 sacc: Only return accession (minimal output) + # - max_target_seqs 1: Only need the best hit + # - evalue: Threshold for significance cmd = [ "blastn", "-db", self.local_blast_db, "-query", tmp_name, - "-evalue", str(threshold), "-max_target_seqs", "1", "-outfmt", "6 sacc" + "-evalue", str(threshold), + "-max_target_seqs", "1", + "-num_threads", str(self.blast_num_threads), + "-outfmt", "6 sacc" # Only accession, tab-separated ] - logger.debug("Running local blastn for RNA: %s", " ".join(cmd)) - out = subprocess.check_output(cmd, text=True).strip() + self.logger.debug("Running local blastn for RNA (threads=%d): %s", + self.blast_num_threads, " ".join(cmd)) + + # Run BLAST with timeout to avoid hanging + try: + out = subprocess.check_output( + cmd, + text=True, + timeout=300, # 5 minute timeout for BLAST search + stderr=subprocess.DEVNULL # Suppress BLAST warnings to reduce I/O + ).strip() + except subprocess.TimeoutExpired: + self.logger.warning("BLAST search timed out after 5 minutes for sequence") + os.remove(tmp_name) + return None + os.remove(tmp_name) return out.split("\n", maxsplit=1)[0] if out else None except Exception as exc: - logger.error("Local blastn failed: %s", exc) + self.logger.error("Local blastn failed: %s", exc) + # Clean up temp file if it still exists + try: + if 'tmp_name' in locals(): + os.remove(tmp_name) + except Exception: + pass return None - def get_by_fasta(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: + def get_by_fasta(self, sequence: str, threshold: float = 0.01) -> Optional[dict]: # pylint: disable=too-many-return-statements """ Search RNAcentral with an RNA sequence. Tries local BLAST first if enabled, falls back to RNAcentral API. @@ -240,23 +292,36 @@ def _extract_sequence(sequence: str) -> Optional[str]: seq = "".join(seq_lines[1:]) else: seq = sequence.strip().replace(" ", "").replace("\n", "") - return seq if seq and re.fullmatch(r"[AUCGN\s]+", seq, re.I) else None + # Accept both U (original RNA) and T + return seq if seq and re.fullmatch(r"[AUCGTN\s]+", seq, re.I) else None try: seq = _extract_sequence(sequence) if not seq: - logger.error("Empty or invalid RNA sequence provided.") + self.logger.error("Empty or invalid RNA sequence provided.") return None # Try local BLAST first if enabled if self.use_local_blast: accession = self._local_blast(seq, threshold) if accession: - logger.debug("Local BLAST found accession: %s", accession) - return self.get_by_rna_id(accession) + self.logger.debug("Local BLAST found accession: %s", accession) + detailed = self.get_by_rna_id(accession) + if detailed: + return detailed + self.logger.info( + "Local BLAST found accession %s but could not retrieve metadata from API.", + accession + ) + return None + self.logger.info( + "Local BLAST found no match for sequence. " + "API fallback disabled when using local database." 
+            ) +            return None -        # Fall back to RNAcentral API if local BLAST didn't find result -        logger.debug("Falling back to RNAcentral API.") +        # Fall back to RNAcentral API only if local BLAST is not enabled +        self.logger.debug("Falling back to RNAcentral API.") md5_hash = self._calculate_md5(seq) search_url = f"{self.base_url}/rna" @@ -269,15 +334,22 @@ def _extract_sequence(sequence: str) -> Optional[str]: results = search_results.get("results", []) if not results: - logger.info("No exact match found in RNAcentral for sequence") + self.logger.info("No exact match found in RNAcentral for sequence") return None + rna_id = results[0].get("rnacentral_id") - if not rna_id: - logger.error("No RNAcentral ID found in search results.") - return None - return self.get_by_rna_id(rna_id) + if rna_id: + detailed = self.get_by_rna_id(rna_id) + if detailed: + return detailed + # Fallback: use search result data if get_by_rna_id returns None + self.logger.debug("Using search result data for %s (get_by_rna_id returned None)", rna_id) + return self._rna_data_to_dict(rna_id, results[0]) + + self.logger.error("No RNAcentral ID found in search results.") + return None except Exception as e: - logger.error("Sequence search failed: %s", e) + self.logger.error("Sequence search failed: %s", e) return None @retry( @@ -289,18 +361,21 @@ def _extract_sequence(sequence: str) -> Optional[str]: async def search(self, query: str, threshold: float = 0.1, **kwargs) -> Optional[Dict]: """Search RNAcentral with either an RNAcentral ID, keyword, or RNA sequence.""" if not query or not isinstance(query, str): - logger.error("Empty or non-string input.") + self.logger.error("Empty or non-string input.") return None query = query.strip() - logger.debug("RNAcentral search query: %s", query) + self.logger.debug("RNAcentral search query: %s", query) loop = asyncio.get_running_loop() - # check if RNA sequence (AUCG characters, contains U) - if query.startswith(">") or ( - re.fullmatch(r"[AUCGN\s]+", query, re.I) and "U" in query.upper() - ): + # check if RNA sequence (AUCG or ATCG characters, contains U or T) + # Note: sequences written with T (DNA alphabet) are also accepted as RNA queries + is_rna_sequence = query.startswith(">") or ( + re.fullmatch(r"[AUCGTN\s]+", query, re.I) and + ("U" in query.upper() or "T" in query.upper()) + ) + if is_rna_sequence: result = await loop.run_in_executor(_get_pool(), self.get_by_fasta, query, threshold) # check if RNAcentral ID (typically starts with URS) elif re.fullmatch(r"URS\d+", query, re.I): diff --git a/graphgen/models/searcher/db/uniprot_searcher.py b/graphgen/models/searcher/db/uniprot_searcher.py index f5542f8c..22181d05 100644 --- a/graphgen/models/searcher/db/uniprot_searcher.py +++ b/graphgen/models/searcher/db/uniprot_searcher.py @@ -19,12 +19,11 @@ ) from graphgen.bases import BaseSearcher -from graphgen.utils import logger @lru_cache(maxsize=None) def _get_pool(): - return ThreadPoolExecutor(max_workers=10) + return ThreadPoolExecutor(max_workers=20) # NOTE: can increase for better parallelism # ensure only one BLAST searcher at a time @@ -39,12 +38,20 @@ class UniProtSearch(BaseSearcher): 3) Search with FASTA sequence (BLAST searcher). Note that NCBIWWW does not support async. 
""" - def __init__(self, use_local_blast: bool = False, local_blast_db: str = "sp_db"): - super().__init__() + def __init__( + self, + use_local_blast: bool = False, + local_blast_db: str = "sp_db", + blast_num_threads: int = 4, + working_dir: str = "cache", + ): + super().__init__(working_dir=working_dir) self.use_local_blast = use_local_blast self.local_blast_db = local_blast_db + self.blast_num_threads = blast_num_threads # Number of threads for BLAST search + if self.use_local_blast and not os.path.isfile(f"{self.local_blast_db}.phr"): - logger.error("Local BLAST database files not found. Please check the path.") + self.logger.error("Local BLAST database files not found. Please check the path.") self.use_local_blast = False def get_by_accession(self, accession: str) -> Optional[dict]: @@ -56,12 +63,12 @@ def get_by_accession(self, accession: str) -> Optional[dict]: except RequestException: # network-related errors raise except Exception as exc: # pylint: disable=broad-except - logger.error("Accession %s not found: %s", accession, exc) + self.logger.error("Accession %s not found: %s", accession, exc) return None @staticmethod def _swissprot_to_dict(record: SwissProt.Record) -> dict: - """error + """ Convert a SwissProt.Record to a dictionary. """ functions = [] @@ -101,10 +108,10 @@ def get_best_hit(self, keyword: str) -> Optional[Dict]: except RequestException: raise except Exception as e: # pylint: disable=broad-except - logger.error("Keyword %s not found: %s", keyword, e) + self.logger.error("Keyword %s not found: %s", keyword, e) return None - def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]: + def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]: # pylint: disable=too-many-return-statements """ Search UniProt with a FASTA sequence and return the best hit. :param fasta_sequence: The FASTA sequence. @@ -117,70 +124,76 @@ def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]: else: seq = fasta_sequence.strip() except Exception as e: # pylint: disable=broad-except - logger.error("Invalid FASTA sequence: %s", e) + self.logger.error("Invalid FASTA sequence: %s", e) return None if not seq: - logger.error("Empty FASTA sequence provided.") + self.logger.error("Empty FASTA sequence provided.") return None - accession = None if self.use_local_blast: accession = self._local_blast(seq, threshold) if accession: - logger.debug("Local BLAST found accession: %s", accession) + self.logger.debug("Local BLAST found accession: %s", accession) + return self.get_by_accession(accession) + self.logger.info( + "Local BLAST found no match for sequence. " + "API fallback disabled when using local database." 
+ ) + return None - if not accession: - logger.debug("Falling back to NCBIWWW.qblast.") + # Fall back to network BLAST only if local BLAST is not enabled + self.logger.debug("Falling back to NCBIWWW.qblast.") - # UniProtKB/Swiss-Prot BLAST API - try: - logger.debug( - "Performing BLAST searcher for the given sequence: %s", seq - ) - result_handle = NCBIWWW.qblast( - program="blastp", - database="swissprot", - sequence=seq, - hitlist_size=1, - expect=threshold, - ) - blast_record = NCBIXML.read(result_handle) - except RequestException: - raise - except Exception as e: # pylint: disable=broad-except - logger.error("BLAST searcher failed: %s", e) - return None + # UniProtKB/Swiss-Prot BLAST API + try: + self.logger.debug( + "Performing BLAST searcher for the given sequence: %s", seq + ) + result_handle = NCBIWWW.qblast( + program="blastp", + database="swissprot", + sequence=seq, + hitlist_size=1, + expect=threshold, + ) + blast_record = NCBIXML.read(result_handle) + except RequestException: + raise + except Exception as e: # pylint: disable=broad-except + self.logger.error("BLAST searcher failed: %s", e) + return None - if not blast_record.alignments: - logger.info("No BLAST hits found for the given sequence.") - return None + if not blast_record.alignments: + self.logger.info("No BLAST hits found for the given sequence.") + return None - best_alignment = blast_record.alignments[0] - best_hsp = best_alignment.hsps[0] - if best_hsp.expect > threshold: - logger.info("No BLAST hits below the threshold E-value.") - return None - hit_id = best_alignment.hit_id + best_alignment = blast_record.alignments[0] + best_hsp = best_alignment.hsps[0] + if best_hsp.expect > threshold: + self.logger.info("No BLAST hits below the threshold E-value.") + return None - # like sp|P01308.1|INS_HUMAN - accession = hit_id.split("|")[1].split(".")[0] if "|" in hit_id else hit_id + # like sp|P01308.1|INS_HUMAN + hit_id = best_alignment.hit_id + accession = hit_id.split("|")[1].split(".")[0] if "|" in hit_id else hit_id return self.get_by_accession(accession) def _local_blast(self, seq: str, threshold: float) -> Optional[str]: """ Perform local BLAST search using local BLAST database. - :param seq: The protein sequence. - :param threshold: E-value threshold for BLAST searcher. - :return: The accession number of the best hit or None if not found. + Optimized with multi-threading and faster output format. 
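+ + Roughly equivalent to the shell command (placeholders, not literal paths): + blastp -db <local_blast_db> -query query.fa -evalue <threshold> -max_target_seqs 1 -num_threads <blast_num_threads> -outfmt "6 sacc"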
""" try: - with tempfile.NamedTemporaryFile( - mode="w+", suffix=".fa", delete=False - ) as tmp: + with tempfile.NamedTemporaryFile(mode="w+", suffix=".fa", delete=False) as tmp: tmp.write(f">query\n{seq}\n") tmp_name = tmp.name + # Optimized BLAST command with: + # - num_threads: Use multiple threads for faster search + # - outfmt 6 sacc: Only return accession (minimal output) + # - max_target_seqs 1: Only need the best hit + # - evalue: Threshold for significance cmd = [ "blastp", "-db", @@ -191,17 +204,33 @@ def _local_blast(self, seq: str, threshold: float) -> Optional[str]: str(threshold), "-max_target_seqs", "1", + "-num_threads", + str(self.blast_num_threads), "-outfmt", - "6 sacc", # only return accession + "6 sacc", # Only accession, tab-separated ] - logger.debug("Running local blastp: %s", " ".join(cmd)) - out = subprocess.check_output(cmd, text=True).strip() + self.logger.debug("Running local blastp (threads=%d): %s", + self.blast_num_threads, " ".join(cmd)) + + # Run BLAST with timeout to avoid hanging + try: + out = subprocess.check_output( + cmd, + text=True, + timeout=300, # 5 minute timeout for BLAST search + stderr=subprocess.DEVNULL # Suppress BLAST warnings to reduce I/O + ).strip() + except subprocess.TimeoutExpired: + self.logger.warning("BLAST search timed out after 5 minutes for sequence") + os.remove(tmp_name) + return None + os.remove(tmp_name) if out: return out.split("\n", maxsplit=1)[0] return None except Exception as exc: # pylint: disable=broad-except - logger.error("Local blastp failed: %s", exc) + self.logger.error("Local blastp failed: %s", exc) return None @retry( @@ -222,11 +251,11 @@ async def search( # auto detect query type if not query or not isinstance(query, str): - logger.error("Empty or non-string input.") + self.logger.error("Empty or non-string input.") return None query = query.strip() - logger.debug("UniProt searcher query: %s", query) + self.logger.debug("UniProt searcher query: %s", query) loop = asyncio.get_running_loop() @@ -234,13 +263,23 @@ async def search( if query.startswith(">") or re.fullmatch( r"[ACDEFGHIKLMNPQRSTVWY\s]+", query, re.I ): - async with _blast_lock: + # Only use lock for network BLAST (NCBIWWW), local BLAST can run in parallel + if self.use_local_blast: + # Local BLAST can run in parallel, no lock needed result = await loop.run_in_executor( _get_pool(), self.get_by_fasta, query, threshold ) + else: + # Network BLAST needs lock to respect rate limits + async with _blast_lock: + result = await loop.run_in_executor( + _get_pool(), self.get_by_fasta, query, threshold + ) # check if accession number - elif re.fullmatch(r"[A-NR-Z0-9]{6,10}", query, re.I): + # UniProt accession IDs: 6-10 characters, must start with a letter + # Format: [A-Z][A-Z0-9]{5,9} (6-10 chars total: 1 letter + 5-9 alphanumeric) + elif re.fullmatch(r"[A-Z][A-Z0-9]{5,9}", query, re.I): result = await loop.run_in_executor( _get_pool(), self.get_by_accession, query ) diff --git a/graphgen/models/storage/kv/json_storage.py b/graphgen/models/storage/kv/json_storage.py index aa7c6f42..4af8d1e5 100644 --- a/graphgen/models/storage/kv/json_storage.py +++ b/graphgen/models/storage/kv/json_storage.py @@ -1,5 +1,6 @@ import os from dataclasses import dataclass +from typing import Iterator, Tuple from graphgen.bases.base_storage import BaseKVStorage from graphgen.utils import load_json, write_json @@ -42,6 +43,42 @@ def get_by_ids(self, ids, fields=None) -> list: def get_all(self) -> dict[str, dict]: return self._data + def iter_items(self) -> Iterator[Tuple[str, 
dict]]: + """ + Iterate over all items without loading everything into memory at once. + Returns an iterator of (key, value) tuples. + """ + for key, value in self._data.items(): + yield key, value + + def get_batch(self, keys: list[str]) -> dict[str, dict]: + """ + Get a batch of items by their keys. + + :param keys: List of keys to retrieve. + :return: Dictionary of {key: value} for the requested keys. + """ + return {key: self._data.get(key) for key in keys if key in self._data} + + def iter_batches(self, batch_size: int = 10000) -> Iterator[dict[str, dict]]: + """ + Iterate over items in batches to avoid loading everything into memory. + + :param batch_size: Number of items per batch. + :return: Iterator of dictionaries, each containing up to batch_size items. + """ + batch = {} + count = 0 + for key, value in self._data.items(): + batch[key] = value + count += 1 + if count >= batch_size: + yield batch + batch = {} + count = 0 + if batch: + yield batch + def filter_keys(self, data: list[str]) -> set[str]: return {s for s in data if s not in self._data} diff --git a/graphgen/operators/__init__.py b/graphgen/operators/__init__.py index 64c78af5..5bb1261a 100644 --- a/graphgen/operators/__init__.py +++ b/graphgen/operators/__init__.py @@ -6,7 +6,7 @@ from .partition import PartitionService from .quiz import QuizService from .read import read -from .search import search_all +from .search import SearchService operators = { "read": read, @@ -15,7 +15,7 @@ "quiz": QuizService, "judge": JudgeService, "extract": ExtractService, - "search": search_all, + "search": SearchService, "partition": PartitionService, "generate": GenerateService, } diff --git a/graphgen/operators/build_kg/build_kg_service.py b/graphgen/operators/build_kg/build_kg_service.py index 532cfc79..220d7a28 100644 --- a/graphgen/operators/build_kg/build_kg_service.py +++ b/graphgen/operators/build_kg/build_kg_service.py @@ -8,6 +8,7 @@ from graphgen.utils import logger from .build_mm_kg import build_mm_kg +from .build_omics_kg import build_omics_kg from .build_text_kg import build_text_kg @@ -41,6 +42,9 @@ def build_kg(self, chunks: List[Chunk]) -> None: for chunk in chunks if chunk.type in ("image", "video", "table", "formula") ] + omics_chunks = [ + chunk for chunk in chunks if chunk.type in ("dna", "rna", "protein") + ] if len(text_chunks) == 0: logger.info("All text chunks are already in the storage") @@ -61,5 +65,17 @@ def build_kg(self, chunks: List[Chunk]) -> None: kg_instance=self.graph_storage, chunks=mm_chunks, ) + if len(omics_chunks) == 0: + logger.info("All omics chunks are already in the storage") + else: + logger.info( + "[Omics Entity and Relation Extraction] processing %d chunks (DNA/RNA/protein)...", + len(omics_chunks), + ) + build_omics_kg( + llm_client=self.llm_client, + kg_instance=self.graph_storage, + chunks=omics_chunks, + ) self.graph_storage.index_done_callback() diff --git a/graphgen/operators/build_kg/build_omics_kg.py b/graphgen/operators/build_kg/build_omics_kg.py new file mode 100644 index 00000000..503bb7eb --- /dev/null +++ b/graphgen/operators/build_kg/build_omics_kg.py @@ -0,0 +1,51 @@ +from collections import defaultdict +from typing import List + +from graphgen.bases import BaseLLMWrapper +from graphgen.bases.base_storage import BaseGraphStorage +from graphgen.bases.datatypes import Chunk +from graphgen.models import OmicsKGBuilder +from graphgen.utils import run_concurrent + + +def build_omics_kg( + llm_client: BaseLLMWrapper, + kg_instance: BaseGraphStorage, + chunks: List[Chunk], +): + """ 
+ Build knowledge graph from multi-omics chunks (DNA, RNA, protein). + + :param llm_client: Synthesizer LLM model to extract entities and relationships + :param kg_instance: Graph storage instance + :param chunks: List of omics chunks + :return: None + """ + kg_builder = OmicsKGBuilder(llm_client=llm_client, max_loop=3) + + results = run_concurrent( + kg_builder.extract, + chunks, + desc="[2/4] Extracting entities and relationships from omics chunks", + unit="chunk", + ) + + nodes = defaultdict(list) + edges = defaultdict(list) + for n, e in results: + for k, v in n.items(): + nodes[k].extend(v) + for k, v in e.items(): + edges[tuple(sorted(k))].extend(v) + + run_concurrent( + lambda kv: kg_builder.merge_nodes(kv, kg_instance=kg_instance), + list(nodes.items()), + desc="Inserting omics entities into storage", + ) + + run_concurrent( + lambda kv: kg_builder.merge_edges(kv, kg_instance=kg_instance), + list(edges.items()), + desc="Inserting omics relationships into storage", + ) diff --git a/graphgen/operators/generate/generate_service.py b/graphgen/operators/generate/generate_service.py index db784d08..e8f447f5 100644 --- a/graphgen/operators/generate/generate_service.py +++ b/graphgen/operators/generate/generate_service.py @@ -7,6 +7,7 @@ AtomicGenerator, CoTGenerator, MultiHopGenerator, + OmicsQAGenerator, VQAGenerator, ) from graphgen.utils import logger, run_concurrent @@ -37,6 +38,8 @@ def __init__( self.generator = MultiHopGenerator(self.llm_client) elif self.method == "cot": self.generator = CoTGenerator(self.llm_client) + elif self.method == "omics_qa": + self.generator = OmicsQAGenerator(self.llm_client) elif self.method in ["vqa"]: self.generator = VQAGenerator(self.llm_client) else: diff --git a/graphgen/operators/judge/judge_service.py b/graphgen/operators/judge/judge_service.py index c7693aec..35797084 100644 --- a/graphgen/operators/judge/judge_service.py +++ b/graphgen/operators/judge/judge_service.py @@ -11,11 +11,11 @@ class JudgeService(BaseOperator): """Service for judging graph edges and nodes using a trainee LLM.""" - def __init__(self, working_dir: str = "cache", graph_backend: str = "kuzu"): + def __init__(self, working_dir: str = "cache"): super().__init__(working_dir=working_dir, op_name="judge_service") self.llm_client: BaseLLMWrapper = init_llm("trainee") self.graph_storage: BaseGraphStorage = init_storage( - backend=graph_backend, + backend="kuzu", working_dir=working_dir, namespace="graph", ) diff --git a/graphgen/operators/partition/partition_service.py b/graphgen/operators/partition/partition_service.py index 2289fec6..f7aae20a 100644 --- a/graphgen/operators/partition/partition_service.py +++ b/graphgen/operators/partition/partition_service.py @@ -2,6 +2,7 @@ import os from typing import Iterable +import numpy as np import pandas as pd from graphgen.bases import BaseGraphStorage, BaseKVStorage, BaseOperator, BaseTokenizer @@ -127,7 +128,7 @@ def _pre_tokenize(self) -> None: self.kg_instance.index_done_callback() logger.info("Pre-tokenization completed.") - def _attach_additional_data_to_node(self, batch: tuple) -> tuple: + def _attach_additional_data_to_node(self, batch: tuple) -> tuple: # pylint: disable=too-many-branches,too-many-statements """ Attach additional data from chunk_storage to nodes in the batch. 
:param batch: tuple of (nodes_data, edges_data) @@ -146,6 +147,9 @@ def _attach_additional_data_to_node(self, batch: tuple) -> tuple: if sid.strip() ] + if not source_ids: + continue + # Handle images if "image" in entity_type: image_chunks = [ @@ -160,4 +164,125 @@ def _attach_additional_data_to_node(self, batch: tuple) -> tuple: node_data["image_data"] = json.loads(image_chunks[0]["content"]) logger.debug("Attached image data to node %s", node_id) + # Handle omics data (protein/dna/rna) + molecule_type = None + if entity_type in ("protein", "dna", "rna"): + molecule_type = entity_type + else: + # Infer from source_id prefix + for sid in source_ids: + sid_lower = sid.lower() + if sid_lower.startswith("protein-"): + molecule_type = "protein" + break + if sid_lower.startswith("dna-"): + molecule_type = "dna" + break + if sid_lower.startswith("rna-"): + molecule_type = "rna" + break + + if molecule_type: + omics_chunks = [ + data + for sid in source_ids + if (data := self.chunk_storage.get_by_id(sid)) + ] + + if not omics_chunks: + logger.warning( + "No chunks found for node %s (type: %s) with source_ids: %s", + node_id, molecule_type, source_ids + ) + continue + + def get_chunk_value(chunk: dict, field: str): + # First check root level of chunk + if field in chunk: + return chunk[field] + # Then check metadata if it exists and is a dict + chunk_metadata = chunk.get("metadata") + if isinstance(chunk_metadata, dict) and field in chunk_metadata: + return chunk_metadata[field] + return None + + # Group chunks by molecule type to preserve all types of sequences + chunks_by_type = {"dna": [], "rna": [], "protein": []} + for chunk in omics_chunks: + chunk_id = chunk.get("_chunk_id", "").lower() + if chunk_id.startswith("dna-"): + chunks_by_type["dna"].append(chunk) + elif chunk_id.startswith("rna-"): + chunks_by_type["rna"].append(chunk) + elif chunk_id.startswith("protein-"): + chunks_by_type["protein"].append(chunk) + + # Field mappings for each molecule type + field_mapping = { + "protein": [ + "protein_name", "gene_names", "organism", "function", + "sequence", "id", "database", "entry_name", "uniprot_id" + ], + "dna": [ + "gene_name", "gene_description", "organism", "chromosome", + "genomic_location", "function", "gene_type", "sequence", + "id", "database" + ], + "rna": [ + "rna_type", "description", "organism", "related_genes", + "gene_name", "so_term", "sequence", "id", "database", + "rnacentral_id" + ], + } + + # Extract and store captions for each molecule type + for mol_type in ["dna", "rna", "protein"]: + type_chunks = chunks_by_type[mol_type] + if not type_chunks: + continue + + # Use the first chunk of this type + type_chunk = type_chunks[0] + caption = {} + + # Extract all relevant fields for this molecule type + for field in field_mapping.get(mol_type, []): + value = get_chunk_value(type_chunk, field) + # Handle numpy arrays properly - check size instead of truthiness + if isinstance(value, np.ndarray): + if value.size > 0: + caption[field] = value.tolist() # Convert to list for compatibility + elif value: # For other types, use normal truthiness check + caption[field] = value + + # Store caption if it has any data + if caption: + caption_key = f"{mol_type}_caption" + node_data[caption_key] = caption + logger.debug("Stored %s caption for node %s with %d fields", mol_type, node_id, len(caption)) + + # For backward compatibility, also attach sequence and other fields from the primary molecule type + # Use the detected molecule_type or default to the first available type + primary_chunk 
= None + if chunks_by_type.get(molecule_type): + primary_chunk = chunks_by_type[molecule_type][0] + elif chunks_by_type["dna"]: + primary_chunk = chunks_by_type["dna"][0] + elif chunks_by_type["rna"]: + primary_chunk = chunks_by_type["rna"][0] + elif chunks_by_type["protein"]: + primary_chunk = chunks_by_type["protein"][0] + else: + primary_chunk = omics_chunks[0] + + # Attach sequence if not already present (for backward compatibility) + if "sequence" not in node_data: + sequence = get_chunk_value(primary_chunk, "sequence") + # Handle numpy arrays properly + if isinstance(sequence, np.ndarray): + if sequence.size > 0: + node_data["sequence"] = sequence.tolist() # Convert to list for compatibility + elif sequence: # For other types, use normal truthiness check + node_data["sequence"] = sequence + return nodes_data, edges_data diff --git a/graphgen/operators/search/__init__.py b/graphgen/operators/search/__init__.py index 3d90f12a..47144c77 100644 --- a/graphgen/operators/search/__init__.py +++ b/graphgen/operators/search/__init__.py @@ -1 +1 @@ -from .search_all import search_all +from .search_service import SearchService diff --git a/graphgen/operators/search/search_all.py b/graphgen/operators/search/search_all.py deleted file mode 100644 index 6017cfee..00000000 --- a/graphgen/operators/search/search_all.py +++ /dev/null @@ -1,83 +0,0 @@ -""" -To use Google Web Search API, -follow the instructions [here](https://developers.google.com/custom-search/v1/overview) -to get your Google searcher api key. - -To use Bing Web Search API, -follow the instructions [here](https://www.microsoft.com/en-us/bing/apis/bing-web-search-api) -and obtain your Bing subscription key. -""" - - -from graphgen.utils import logger, run_concurrent - - -async def search_all( - seed_data: dict, - search_config: dict, -) -> dict: - """ - Perform searches across multiple search types and aggregate the results. - :param seed_data: A dictionary containing seed data with entity names. - :param search_config: A dictionary specifying which data sources to use for searching. 
- :return: A dictionary with - """ - - results = {} - data_sources = search_config.get("data_sources", []) - - for data_source in data_sources: - data = list(seed_data.values()) - data = [d["content"] for d in data if "content" in d] - data = list(set(data)) # Remove duplicates - - if data_source == "uniprot": - from graphgen.models import UniProtSearch - - uniprot_search_client = UniProtSearch( - **search_config.get("uniprot_params", {}) - ) - - uniprot_results = await run_concurrent( - uniprot_search_client.search, - data, - desc="Searching UniProt database", - unit="keyword", - ) - results[data_source] = uniprot_results - - elif data_source == "ncbi": - from graphgen.models import NCBISearch - - ncbi_search_client = NCBISearch( - **search_config.get("ncbi_params", {}) - ) - - ncbi_results = await run_concurrent( - ncbi_search_client.search, - data, - desc="Searching NCBI database", - unit="keyword", - ) - results[data_source] = ncbi_results - - elif data_source == "rnacentral": - from graphgen.models import RNACentralSearch - - rnacentral_search_client = RNACentralSearch( - **search_config.get("rnacentral_params", {}) - ) - - rnacentral_results = await run_concurrent( - rnacentral_search_client.search, - data, - desc="Searching RNAcentral database", - unit="keyword", - ) - results[data_source] = rnacentral_results - - else: - logger.error("Data source %s not supported.", data_source) - continue - - return results diff --git a/graphgen/operators/search/search_service.py b/graphgen/operators/search/search_service.py new file mode 100644 index 00000000..7cc53dad --- /dev/null +++ b/graphgen/operators/search/search_service.py @@ -0,0 +1,428 @@ +""" +To use Google Web Search API, +follow the instructions [here](https://developers.google.com/custom-search/v1/overview) +to get your Google searcher api key. + +To use Bing Web Search API, +follow the instructions [here](https://www.microsoft.com/en-us/bing/apis/bing-web-search-api) +and obtain your Bing subscription key. +""" + +import pandas as pd +import numpy as np + +from graphgen.bases import BaseOperator +from graphgen.utils import create_event_loop, run_concurrent + + +class SearchService(BaseOperator): + """ + Service class for performing searches across multiple data sources. + Provides search functionality for UniProt, NCBI, and RNAcentral databases. + """ + + def __init__( + self, + working_dir: str = "cache", + data_sources: list = None, + ncbi_params: dict = None, + uniprot_params: dict = None, + rnacentral_params: dict = None, + save_interval: int = 1000, + **kwargs, + ): + super().__init__(working_dir=working_dir, op_name="search_service") + self.working_dir = working_dir + + # Build search_config dictionary from parameters + self.search_config = { + "data_sources": data_sources or [], + } + + if ncbi_params: + self.search_config["ncbi_params"] = ncbi_params + if uniprot_params: + self.search_config["uniprot_params"] = uniprot_params + if rnacentral_params: + self.search_config["rnacentral_params"] = rnacentral_params + + self.save_interval = save_interval + self.search_storage = None # Optional: can be initialized if needed for saving intermediate results + + async def _perform_searches(self, seed_data: dict) -> dict: + """ + Internal method to perform searches across multiple search types and aggregate the results. + :param seed_data: A dictionary containing seed data with entity names. 
+ :return: A dictionary with search results + """ + results = {} + data_sources = self.search_config.get("data_sources", []) + + for data_source in data_sources: + data = list(seed_data.values()) + data = [d["content"] for d in data if "content" in d] + data = list(set(data)) # Remove duplicates + + # Prepare save callback for this data source + def make_save_callback(source_name): + def save_callback(intermediate_results, completed_count): + """Save intermediate search results.""" + if self.search_storage is None: + return + + # Convert results list to dict format + # Results are tuples of (query, result_dict) or just result_dict + batch_results = {} + for result in intermediate_results: + if result is None: + continue + # Check if result is a dict with _search_query key + if isinstance(result, dict) and "_search_query" in result: + query = result["_search_query"] + # Create a key for the result (using query as key) + key = f"{source_name}:{query}" + batch_results[key] = result + elif isinstance(result, dict): + # If no _search_query, use a generated key + key = f"{source_name}:{completed_count}" + batch_results[key] = result + + if batch_results: + # Filter out already existing keys + new_keys = self.search_storage.filter_keys(list(batch_results.keys())) + new_results = {k: v for k, v in batch_results.items() if k in new_keys} + if new_results: + self.search_storage.upsert(new_results) + self.search_storage.index_done_callback() + self.logger.debug("Saved %d intermediate results for %s", len(new_results), source_name) + + return save_callback + + if data_source == "uniprot": + from graphgen.models import UniProtSearch + + uniprot_params = self.search_config.get("uniprot_params", {}).copy() + # Get max_concurrent from config before passing params to constructor + max_concurrent = uniprot_params.pop("max_concurrent", None) + + uniprot_search_client = UniProtSearch( + working_dir=self.working_dir, + **uniprot_params + ) + + uniprot_results = await run_concurrent( + uniprot_search_client.search, + data, + desc="Searching UniProt database", + unit="keyword", + save_interval=self.save_interval if self.save_interval > 0 else 0, + save_callback=( + make_save_callback("uniprot") + if self.search_storage and self.save_interval > 0 + else None + ), + max_concurrent=max_concurrent, + ) + results[data_source] = uniprot_results + + elif data_source == "ncbi": + from graphgen.models import NCBISearch + + ncbi_params = self.search_config.get("ncbi_params", {}).copy() + # Get max_concurrent from config before passing params to constructor + max_concurrent = ncbi_params.pop("max_concurrent", None) + + ncbi_search_client = NCBISearch( + working_dir=self.working_dir, + **ncbi_params + ) + + ncbi_results = await run_concurrent( + ncbi_search_client.search, + data, + desc="Searching NCBI database", + unit="keyword", + save_interval=self.save_interval if self.save_interval > 0 else 0, + save_callback=( + make_save_callback("ncbi") + if self.search_storage and self.save_interval > 0 + else None + ), + max_concurrent=max_concurrent, + ) + results[data_source] = ncbi_results + + elif data_source == "rnacentral": + from graphgen.models import RNACentralSearch + + rnacentral_params = self.search_config.get("rnacentral_params", {}).copy() + # Get max_concurrent from config before passing params to constructor + max_concurrent = rnacentral_params.pop("max_concurrent", None) + + rnacentral_search_client = RNACentralSearch( + working_dir=self.working_dir, + **rnacentral_params + ) + + rnacentral_results = await 
run_concurrent( + rnacentral_search_client.search, + data, + desc="Searching RNAcentral database", + unit="keyword", + save_interval=self.save_interval if self.save_interval > 0 else 0, + save_callback=( + make_save_callback("rnacentral") + if self.search_storage and self.save_interval > 0 + else None + ), + max_concurrent=max_concurrent, + ) + results[data_source] = rnacentral_results + + else: + self.logger.error("Data source %s not supported.", data_source) + continue + + return results + + def _is_already_searched(self, doc: dict) -> bool: + """ + Check if a document already contains search results. + + :param doc: Document dictionary + :return: True if document appears to already contain search results + """ + # Check for data_source field (added by search_service) + if "data_source" in doc and doc["data_source"]: + return True + + # Check for database field (added by search operations) + if "database" in doc and doc["database"] in ["UniProt", "NCBI", "RNAcentral"]: + # Also check for molecule_type to confirm it's a search result + if "molecule_type" in doc and doc["molecule_type"] in ["DNA", "RNA", "protein"]: + return True + + # Check for search-specific fields that indicate search results + search_indicators = [ + "uniprot_id", "entry_name", # UniProt + "gene_id", "gene_name", "chromosome", # NCBI + "rnacentral_id", "rna_type", # RNAcentral + ] + if any(indicator in doc for indicator in search_indicators): + # Make sure it's not just metadata by checking for database or molecule_type + if "database" in doc or "molecule_type" in doc: + return True + + return False + + @staticmethod + def _clean_value(v): + """Recursively convert numpy arrays and other problematic types to Python-native types.""" + if isinstance(v, np.ndarray): + return v.tolist() + if isinstance(v, (list, tuple)): + return [SearchService._clean_value(item) for item in v] + if isinstance(v, dict): + return {k: SearchService._clean_value(val) for k, val in v.items()} + return v + + def _normalize_searched_data(self, doc: dict) -> dict: # pylint: disable=too-many-branches + """ + Normalize a document that already contains search results to the expected format. 
+
+        :param doc: Document dictionary with search results
+        :return: Normalized document dictionary
+        """
+        # Ensure required fields exist
+        doc_id = doc.get("_doc_id")
+        if not doc_id:
+            # Generate doc_id from id or other fields
+            raw_doc_id = doc.get("id") or doc.get("_search_query") or f"doc-{hash(str(doc))}"
+            doc_id = str(raw_doc_id)
+
+        # Ensure doc_id starts with "doc-" prefix
+        if not doc_id.startswith("doc-"):
+            doc_id = f"doc-{doc_id}"
+
+        # Determine document type from molecule_type or existing type
+        doc_type = doc.get("type", "text")
+        if doc_type == "text" and "molecule_type" in doc:
+            molecule_type = doc.get("molecule_type", "").lower()
+            if molecule_type in ["dna", "rna", "protein"]:
+                doc_type = molecule_type
+
+        # Ensure data_source field exists
+        data_source = doc.get("data_source")
+        if not data_source:
+            # Infer from database field
+            database = doc.get("database", "").lower()
+            if "uniprot" in database:
+                data_source = "uniprot"
+            elif "ncbi" in database:
+                data_source = "ncbi"
+            elif "rnacentral" in database or "rna" in database:
+                data_source = "rnacentral"
+
+        # Build or preserve content field
+        content = doc.get("content")
+        if not content or content.strip() == "":
+            # Build content from available fields if missing
+            content_parts = []
+            if doc.get("title"):
+                content_parts.append(f"Title: {doc['title']}")
+            if doc.get("description"):
+                content_parts.append(f"Description: {doc['description']}")
+            if doc.get("function"):
+                func = doc["function"]
+                if isinstance(func, list):
+                    func = ", ".join(str(f) for f in func)
+                content_parts.append(f"Function: {func}")
+            if doc.get("sequence"):
+                content_parts.append(f"Sequence: {doc['sequence']}")
+
+            if not content_parts:
+                # Fallback: create content from key fields
+                key_fields = ["protein_name", "gene_name", "gene_description", "organism"]
+                for field in key_fields:
+                    if field in doc and doc[field]:
+                        content_parts.append(f"{field}: {doc[field]}")
+
+            content = "\n".join(content_parts) if content_parts else str(doc)
+
+        # Create normalized row; spread the original doc first so that the
+        # normalized fields computed above take precedence over stale values
+        normalized_doc = {
+            **doc,  # Include all original fields for metadata
+            "_doc_id": doc_id,
+            "type": doc_type,
+            "content": content,
+            "data_source": data_source,
+        }
+
+        return normalized_doc
+
+    def process(self, batch: pd.DataFrame) -> pd.DataFrame:  # pylint: disable=too-many-branches
+        """
+        Process a batch of documents and perform searches.
+        This is the Ray Data operator interface.
+
+        If input data already contains search results (detected by presence of
+        data_source, database, or search-specific fields), the search step is
+        skipped and the data is normalized and returned directly.
+
+        :param batch: DataFrame containing documents with at least '_doc_id' and 'content' columns
+        :return: DataFrame containing search results
+        """
+        # Convert DataFrame to dictionary format
+        docs = batch.to_dict(orient="records")
+
+        # Check if data already contains search results
+        already_searched = all(self._is_already_searched(doc) for doc in docs if doc)
+
+        if already_searched:
+            # Data already contains search results, normalize and return directly
+            self.logger.info(
+                "Input data already contains search results. "
+                "Skipping search step and normalizing data."
+ ) + result_rows = [] + for doc in docs: + if not doc: + continue + normalized_doc = self._normalize_searched_data(doc) + result_rows.append(normalized_doc) + + if not result_rows: + self.logger.warning("No documents found in batch") + return pd.DataFrame(columns=["_doc_id", "type", "content", "data_source"]) + + return pd.DataFrame(result_rows) + + # Data doesn't contain search results, perform search as usual + seed_data = {doc.get("_doc_id", f"doc-{i}"): doc for i, doc in enumerate(docs)} + + # Perform searches asynchronously + loop, created = create_event_loop() + try: + if loop.is_running(): + # If loop is already running, we can't use run_until_complete + # This shouldn't happen in normal usage, but handle it gracefully + raise RuntimeError( + "Cannot use process when event loop is already running. " + "This is likely a Ray worker configuration issue." + ) + search_results = loop.run_until_complete( + self._perform_searches(seed_data) + ) + finally: + # Only close the loop if we created it + if created: + loop.close() + + # Convert search_results from {data_source: [results]} to DataFrame + # Each result becomes a document row compatible with chunk service + result_rows = [] + + for data_source, result_list in search_results.items(): + if not isinstance(result_list, list): + continue + + for result in result_list: + if result is None: + continue + + # Convert search result to document format expected by chunk service + # Build content from available fields + content_parts = [] + if result.get("title"): + content_parts.append(f"Title: {result['title']}") + if result.get("description"): + content_parts.append(f"Description: {result['description']}") + if result.get("function"): + content_parts.append(f"Function: {result['function']}") + if result.get("sequence"): + content_parts.append(f"Sequence: {result['sequence']}") + + # If no content parts, use a default or combine all fields + if not content_parts: + # Fallback: create content from all string fields + content_parts = [ + f"{k}: {v}" + for k, v in result.items() + if isinstance(v, (str, int, float)) and k != "_search_query" + ] + + content = "\n".join(content_parts) if content_parts else str(result) + + # Determine document type from molecule_type or default to "text" + doc_type = result.get("molecule_type", "text").lower() + if doc_type not in ["text", "dna", "rna", "protein"]: + doc_type = "text" + + # Convert to string to handle Ray Data ListElement and other types + raw_doc_id = result.get("id") or result.get("_search_query") or f"search-{len(result_rows)}" + doc_id = str(raw_doc_id) + + # Ensure doc_id starts with "doc-" prefix + if not doc_id.startswith("doc-"): + doc_id = f"doc-{doc_id}" + + # Convert numpy arrays and complex types to Python-native types + # to avoid Ray Data tensor extension casting issues + cleaned_result = {k: self._clean_value(v) for k, v in result.items()} + + # Create document row with all result fields plus required fields + row = { + "_doc_id": doc_id, + "type": doc_type, + "content": content, + "data_source": data_source, + **cleaned_result, # Include all original result fields for metadata + } + result_rows.append(row) + + if not result_rows: + self.logger.warning("No search results generated for this batch") + # Return empty DataFrame with expected structure + return pd.DataFrame(columns=["_doc_id", "type", "content", "data_source"]) + + return pd.DataFrame(result_rows) diff --git a/graphgen/run.py b/graphgen/run.py index a1b65364..179efc7d 100644 --- a/graphgen/run.py +++ b/graphgen/run.py @@ 
-63,13 +63,21 @@ def main(): .joinpath("aggregated_config.yaml"), type=str, ) + parser.add_argument( + "--output_dir", + help="Output directory for GraphGen results.", + default=None, + type=str, + ) args = parser.parse_args() with open(args.config_file, "r", encoding="utf-8") as f: config = yaml.load(f, Loader=yaml.FullLoader) - working_dir = config.get("global_params", {}).get("working_dir", "cache") + working_dir = args.output_dir or config.get("global_params", {}).get( + "working_dir", "cache" + ) unique_id = int(time.time()) output_path = os.path.join(working_dir, "output", f"{unique_id}") set_working_dir(output_path) diff --git a/graphgen/templates/__init__.py b/graphgen/templates/__init__.py index 0940e910..99e297ee 100644 --- a/graphgen/templates/__init__.py +++ b/graphgen/templates/__init__.py @@ -6,9 +6,15 @@ ATOMIC_GENERATION_PROMPT, COT_GENERATION_PROMPT, MULTI_HOP_GENERATION_PROMPT, + OMICS_QA_GENERATION_PROMPT, VQA_GENERATION_PROMPT, ) -from .kg import KG_EXTRACTION_PROMPT, KG_SUMMARIZATION_PROMPT, MMKG_EXTRACTION_PROMPT +from .kg import ( + KG_EXTRACTION_PROMPT, + KG_SUMMARIZATION_PROMPT, + MMKG_EXTRACTION_PROMPT, + OMICS_KG_EXTRACTION_PROMPT, +) from .question_generation import QUESTION_GENERATION_PROMPT from .search_judgement import SEARCH_JUDGEMENT_PROMPT from .statement_judgement import STATEMENT_JUDGEMENT_PROMPT diff --git a/graphgen/templates/generation/__init__.py b/graphgen/templates/generation/__init__.py index b58c2b6c..22c3797a 100644 --- a/graphgen/templates/generation/__init__.py +++ b/graphgen/templates/generation/__init__.py @@ -2,4 +2,5 @@ from .atomic_generation import ATOMIC_GENERATION_PROMPT from .cot_generation import COT_GENERATION_PROMPT from .multi_hop_generation import MULTI_HOP_GENERATION_PROMPT +from .omics_qa_generation import OMICS_QA_GENERATION_PROMPT from .vqa_generation import VQA_GENERATION_PROMPT diff --git a/graphgen/templates/generation/omics_qa_generation.py b/graphgen/templates/generation/omics_qa_generation.py new file mode 100644 index 00000000..71f86b70 --- /dev/null +++ b/graphgen/templates/generation/omics_qa_generation.py @@ -0,0 +1,103 @@ +# pylint: disable=C0301 +OMICS_QA_TEMPLATE_EN: str = """You are a senior computational biologist specializing in multi-omics data analysis (genomics, transcriptomics, proteomics). Your task is to generate logically coherent, verifiable and non-hallucinated question-answer pairs for the given biological sample described by the provided ENTITIES and RELATIONSHIPS. +Use English as the output language. + +---Objectives--- +Create multiple sets of omics-centric QA pairs that satisfy the following: +1. Only ask about objectively existing facts in the provided data (e.g., gene names, sequence information, functional annotations, regulatory elements, structural features, experimental metadata, etc.). Avoid subjective or speculative questions. +2. Ensure that each question has a single, clear and verifiable answer that can be directly confirmed from the given entities/relationships. +3. Questions should cover diverse aspects: sequence, structure, function, interactions, regulation, experimental annotations, etc. +4. Avoid repetitive questions; each question must be unique and meaningful. +5. Use concise, unambiguous language; do not invent information beyond the provided data. + +---Instructions--- +1. Carefully analyse the supplied ENTITIES and RELATIONSHIPS to identify: + - Biological entities (genes, proteins, RNA molecules, regulatory elements, pathways, etc.) 
+ - Sequence information (DNA sequences, RNA sequences, protein sequences)
+ - Functional annotations (gene function, protein function, RNA function, biological processes)
+ - Structural features (chromosomal location, genomic coordinates, domain structures, etc.)
+ - Regulatory relationships (transcription, translation, regulation, interaction)
+ - Experimental metadata (database IDs, organism, experimental methods, etc.)
+2. Organise information logically:
+ - Start with sequence/primary structure information
+ - Proceed to functional annotations and biological roles
+ - Include regulatory relationships and interactions
+ - End with experimental context and metadata
+3. Maintain scientific accuracy and consistent nomenclature (standard gene names, sequence identifiers, etc.).
+4. Review each QA pair to guarantee logical consistency and absence of hallucination.
+
+################
+-ENTITIES-
+################
+{entities}
+
+################
+-RELATIONSHIPS-
+################
+{relationships}
+################
+Directly output the generated QA pairs below. Do NOT copy any example questions, and do NOT include extraneous text.
+IMPORTANT: Generate actual questions and answers, NOT placeholders. Do NOT include angle brackets or placeholder text such as <question1> or <answer1>.
+Simply output your generated questions and answers in the following format:
+
+Question: [Your actual question here]
+Answer: [Your actual answer here]
+
+Question: [Your actual question here]
+Answer: [Your actual answer here]
+
+"""
+
+OMICS_QA_TEMPLATE_ZH: str = """你是一位资深的多组学数据计算生物学家(基因组学、转录组学、蛋白质组学)。你的任务是根据下述提供的实体与关系,为给定的生物样本生成逻辑连贯、可验证、无幻觉的问答对。
+使用中文作为输出语言。
+
+---目标---
+创建多组以组学数据为中心的问答对,满足:
+1. 仅询问数据中客观存在的事实(如基因名称、序列信息、功能注释、调控元件、结构特征、实验元数据等),避免主观或推测性问题。
+2. 每个问题必须有单一、明确且可直接验证的答案,答案必须能从给定实体/关系中直接确认。
+3. 问题需覆盖:序列、结构、功能、相互作用、调控、实验注释等多个维度,确保多样性与全面性。
+4. 避免重复提问,每个问题都独特且有意义。
+5. 语言简洁、无歧义,严禁编造超出给定数据的信息。
+
+---说明---
+1. 仔细分析提供的实体与关系,识别:
+ - 生物实体(基因、蛋白质、RNA分子、调控元件、通路等)
+ - 序列信息(DNA序列、RNA序列、蛋白质序列)
+ - 功能注释(基因功能、蛋白质功能、RNA功能、生物学过程)
+ - 结构特征(染色体位置、基因组坐标、结构域等)
+ - 调控关系(转录、翻译、调控、相互作用)
+ - 实验元数据(数据库ID、生物体、实验方法等)
+2. 按逻辑顺序组织信息:
+ - 从序列/一级结构信息入手
+ - 再到功能注释和生物学作用
+ - 包括调控关系和相互作用
+ - 最后到实验背景和元数据
+3. 保持科学准确性,使用统一命名规范(标准基因名、序列标识符等)。
+4. 
检查每对问答,确保逻辑一致且无幻觉。
+
+################
+-实体-
+################
+{entities}
+
+################
+-关系-
+################
+{relationships}
+################
+请直接在下方输出生成的问答对,不要复制任何示例,不要输出无关内容。
+重要提示:请生成实际的问题和答案,不要使用占位符。不要包含尖括号或占位符文本(如 <问题1> 或 <答案1>)。
+请直接按照以下格式输出生成的问题和答案:
+
+问题: [你生成的实际问题]
+答案: [你生成的实际答案]
+
+问题: [你生成的实际问题]
+答案: [你生成的实际答案]
+
+"""
+
+OMICS_QA_GENERATION_PROMPT = {
+    "en": OMICS_QA_TEMPLATE_EN,
+    "zh": OMICS_QA_TEMPLATE_ZH,
+}
diff --git a/graphgen/templates/kg/__init__.py b/graphgen/templates/kg/__init__.py
index ea865ce6..e39c1408 100644
--- a/graphgen/templates/kg/__init__.py
+++ b/graphgen/templates/kg/__init__.py
@@ -1,3 +1,5 @@
 from .kg_extraction import KG_EXTRACTION_PROMPT
 from .kg_summarization import KG_SUMMARIZATION_PROMPT
 from .mm_kg_extraction import MMKG_EXTRACTION_PROMPT
+from .omics_kg_extraction import OMICS_KG_EXTRACTION_PROMPT
+from .protein_kg_extraction import PROTEIN_KG_EXTRACTION_PROMPT
diff --git a/graphgen/templates/kg/omics_kg_extraction.py b/graphgen/templates/kg/omics_kg_extraction.py
new file mode 100644
index 00000000..d105dd38
--- /dev/null
+++ b/graphgen/templates/kg/omics_kg_extraction.py
@@ -0,0 +1,209 @@
+# pylint: disable=C0301
+TEMPLATE_EN: str = """You are a bioinformatics expert, skilled at analyzing biological sequences (DNA, RNA, protein) and their metadata to extract biological entities and their relationships.
+
+-Goal-
+Given a biological sequence chunk (DNA, RNA, or protein) along with its metadata, identify all relevant biological entities and their relationships.
+Use English as output language.
+
+-Steps-
+1. Identify all biological entities. For each identified entity, extract the following information:
+- entity_name: Name of the entity (gene name, protein name, RNA name, domain name, etc.), capitalized
+- entity_type: One of the following types: [{entity_types}]
+- entity_summary: Comprehensive summary of the entity's biological function, structure, or properties
+Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_summary>)
+
+2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *biologically related* to each other.
+For each pair of related entities, extract the following information:
+- source_entity: name of the source entity, as identified in step 1
+- target_entity: name of the target entity, as identified in step 1
+- relationship_summary: explanation of the biological relationship (e.g., encodes, transcribes, translates, interacts, regulates, homologous_to, located_in, etc.)
+Format each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_summary>)
+
+3. Identify high-level key words that summarize the main biological concepts, functions, or themes.
+Format the content-level key words as ("content_keywords"{tuple_delimiter}<high_level_keywords>)
+
+4. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.
+
+5. When finished, output {completion_delimiter}
+
+################
+-Examples-
+################
+-Example 1-
+Sequence Metadata:
+################
+molecule_type: DNA
+database: NCBI
+id: NG_033923
+gene_name: BRCA1
+gene_description: BRCA1 DNA repair associated
+organism: Homo sapiens
+gene_type: protein-coding
+chromosome: 17
+genomic_location: 43044295-43125483
+function: BRCA1 is a tumor suppressor gene involved in DNA repair
+sequence_chunk: ATGCGATCGATCGATCG... 
(first 500bp of BRCA1 gene) +################ +Output: +("entity"{tuple_delimiter}"BRCA1"{tuple_delimiter}"gene"{tuple_delimiter}"BRCA1 is a protein-coding tumor suppressor gene located on chromosome 17 in humans, involved in DNA repair mechanisms."){record_delimiter} +("entity"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"organism"{tuple_delimiter}"Human species, the organism in which BRCA1 gene is found."){record_delimiter} +("entity"{tuple_delimiter}"chromosome 17"{tuple_delimiter}"location"{tuple_delimiter}"Chromosome 17 is the chromosomal location of the BRCA1 gene in humans."){record_delimiter} +("entity"{tuple_delimiter}"DNA repair"{tuple_delimiter}"biological_process"{tuple_delimiter}"DNA repair is a biological process in which BRCA1 is involved as a tumor suppressor."){record_delimiter} +("relationship"{tuple_delimiter}"BRCA1"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"BRCA1 is a gene found in Homo sapiens."){record_delimiter} +("relationship"{tuple_delimiter}"BRCA1"{tuple_delimiter}"chromosome 17"{tuple_delimiter}"BRCA1 is located on chromosome 17 in the human genome."){record_delimiter} +("relationship"{tuple_delimiter}"BRCA1"{tuple_delimiter}"DNA repair"{tuple_delimiter}"BRCA1 is involved in DNA repair processes as a tumor suppressor gene."){record_delimiter} +("content_keywords"{tuple_delimiter}"tumor suppressor, DNA repair, genetic disease, cancer genetics"){completion_delimiter} + +-Example 2- +Sequence Metadata: +################ +molecule_type: RNA +database: RNAcentral +id: URS0000000001 +rna_type: miRNA +description: hsa-let-7a-1 microRNA +organism: Homo sapiens +related_genes: ["LIN28", "HMGA2"] +sequence_chunk: CUCCUUUGACGUUAGCGGCGGACGGGUUAGUAACACGUGGGUAACCUACCUAUAAGACUGGGAUAACUUCGGGAAACCGGAGCUAAUACCGGAUAAUAUUUCGAACCGCAUGGUUCGAUAGUGAAAGAUGGUUUUGCUAUCACUUAUAGAUGGACCCGCGCCGUAUUAGCUAGUUGGUAAGGUAACGGCUUACCAAGGCGACGAUACGUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGAACUGAGACACGGUCCAGACUCCUACGGGAGGCAGCAGGGG +################ +Output: +("entity"{tuple_delimiter}"hsa-let-7a-1"{tuple_delimiter}"rna"{tuple_delimiter}"hsa-let-7a-1 is a microRNA (miRNA) found in Homo sapiens, involved in gene regulation."){record_delimiter} +("entity"{tuple_delimiter}"LIN28"{tuple_delimiter}"gene"{tuple_delimiter}"LIN28 is a gene related to hsa-let-7a-1 microRNA, involved in RNA processing and development."){record_delimiter} +("entity"{tuple_delimiter}"HMGA2"{tuple_delimiter}"gene"{tuple_delimiter}"HMGA2 is a gene related to hsa-let-7a-1 microRNA, involved in chromatin structure and gene expression."){record_delimiter} +("entity"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"organism"{tuple_delimiter}"Human species, the organism in which hsa-let-7a-1 is found."){record_delimiter} +("entity"{tuple_delimiter}"microRNA"{tuple_delimiter}"rna_type"{tuple_delimiter}"MicroRNA is a type of small non-coding RNA involved in post-transcriptional gene regulation."){record_delimiter} +("relationship"{tuple_delimiter}"hsa-let-7a-1"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"hsa-let-7a-1 is a microRNA found in Homo sapiens."){record_delimiter} +("relationship"{tuple_delimiter}"hsa-let-7a-1"{tuple_delimiter}"LIN28"{tuple_delimiter}"hsa-let-7a-1 microRNA is related to LIN28 gene, potentially regulating its expression."){record_delimiter} +("relationship"{tuple_delimiter}"hsa-let-7a-1"{tuple_delimiter}"HMGA2"{tuple_delimiter}"hsa-let-7a-1 microRNA is related to HMGA2 gene, potentially regulating its expression."){record_delimiter} 
+("relationship"{tuple_delimiter}"hsa-let-7a-1"{tuple_delimiter}"microRNA"{tuple_delimiter}"hsa-let-7a-1 belongs to the microRNA class of RNA molecules."){record_delimiter} +("content_keywords"{tuple_delimiter}"microRNA, gene regulation, post-transcriptional control, RNA processing"){completion_delimiter} + +-Example 3- +Sequence Metadata: +################ +molecule_type: protein +database: UniProt +id: P01308 +protein_name: Insulin +organism: Homo sapiens +function: ["Regulates glucose metabolism", "Hormone signaling"] +sequence_chunk: MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN +################ +Output: +("entity"{tuple_delimiter}"Insulin"{tuple_delimiter}"protein"{tuple_delimiter}"Insulin is a protein hormone in Homo sapiens that regulates glucose metabolism and hormone signaling."){record_delimiter} +("entity"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"organism"{tuple_delimiter}"Human species, the organism in which Insulin is produced."){record_delimiter} +("entity"{tuple_delimiter}"glucose metabolism"{tuple_delimiter}"biological_process"{tuple_delimiter}"Glucose metabolism is a biological process regulated by Insulin."){record_delimiter} +("entity"{tuple_delimiter}"hormone signaling"{tuple_delimiter}"biological_process"{tuple_delimiter}"Hormone signaling is a biological process in which Insulin participates as a signaling molecule."){record_delimiter} +("relationship"{tuple_delimiter}"Insulin"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"Insulin is a protein produced in Homo sapiens."){record_delimiter} +("relationship"{tuple_delimiter}"Insulin"{tuple_delimiter}"glucose metabolism"{tuple_delimiter}"Insulin regulates glucose metabolism in the body."){record_delimiter} +("relationship"{tuple_delimiter}"Insulin"{tuple_delimiter}"hormone signaling"{tuple_delimiter}"Insulin participates in hormone signaling pathways."){record_delimiter} +("content_keywords"{tuple_delimiter}"hormone, metabolism, glucose regulation, signaling pathway"){completion_delimiter} + +################ +-Real Data- +################ +Entity_types: {entity_types} +Sequence Metadata: {metadata_text} +Sequence Chunk: {sequence_chunk} +################ +Output: +""" + + +TEMPLATE_ZH: str = """你是一个生物信息学专家,擅长分析生物序列(DNA、RNA、蛋白质)及其元数据,提取生物实体及其关系。 + +-目标- +给定一个生物序列片段(DNA、RNA或蛋白质)及其元数据,识别所有相关的生物实体及其关系。 +使用中文作为输出语言。 + +-步骤- +1. 识别所有生物实体。对于每个识别的实体,提取以下信息: + - entity_name:实体的名称(基因名、蛋白质名、RNA名、功能域名等),首字母大写 + - entity_type:以下类型之一:[{entity_types}] + - entity_summary:实体生物学功能、结构或属性的全面总结 + 将每个实体格式化为("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +2. 从步骤1中识别的实体中,识别所有(源实体,目标实体)对,这些实体彼此之间*在生物学上相关*。 + 对于每对相关的实体,提取以下信息: + - source_entity:步骤1中识别的源实体名称 + - target_entity:步骤1中识别的目标实体名称 + - relationship_summary:生物学关系的解释(例如:编码、转录、翻译、相互作用、调控、同源、位于等) + 将每个关系格式化为("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + +3. 识别总结主要生物学概念、功能或主题的高级关键词。 + 将内容级关键词格式化为("content_keywords"{tuple_delimiter}) + +4. 以中文返回步骤1和2中识别出的所有实体和关系的输出列表。使用**{record_delimiter}**作为列表分隔符。 + +5. 完成后,输出{completion_delimiter} + +################ +-示例- +################ +-示例 1- +序列元数据: +################ +molecule_type: DNA +database: NCBI +id: NG_033923 +gene_name: BRCA1 +gene_description: BRCA1 DNA repair associated +organism: Homo sapiens +gene_type: protein-coding +chromosome: 17 +genomic_location: 43044295-43125483 +function: BRCA1 is a tumor suppressor gene involved in DNA repair +sequence_chunk: ATGCGATCGATCGATCG... 
(BRCA1基因的前500bp) +################ +输出: +("entity"{tuple_delimiter}"BRCA1"{tuple_delimiter}"gene"{tuple_delimiter}"BRCA1是位于人类17号染色体上的蛋白质编码肿瘤抑制基因,参与DNA修复机制。"){record_delimiter} +("entity"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"organism"{tuple_delimiter}"人类,BRCA1基因所在的生物体。"){record_delimiter} +("entity"{tuple_delimiter}"17号染色体"{tuple_delimiter}"location"{tuple_delimiter}"17号染色体是BRCA1基因在人类基因组中的位置。"){record_delimiter} +("entity"{tuple_delimiter}"DNA修复"{tuple_delimiter}"biological_process"{tuple_delimiter}"DNA修复是BRCA1作为肿瘤抑制基因参与的生物学过程。"){record_delimiter} +("relationship"{tuple_delimiter}"BRCA1"{tuple_delimiter}"Homo sapiens"{tuple_delimiter}"BRCA1是在人类中发现的基因。"){record_delimiter} +("relationship"{tuple_delimiter}"BRCA1"{tuple_delimiter}"17号染色体"{tuple_delimiter}"BRCA1位于人类基因组的17号染色体上。"){record_delimiter} +("relationship"{tuple_delimiter}"BRCA1"{tuple_delimiter}"DNA修复"{tuple_delimiter}"BRCA1作为肿瘤抑制基因参与DNA修复过程。"){record_delimiter} +("content_keywords"{tuple_delimiter}"肿瘤抑制, DNA修复, 遗传疾病, 癌症遗传学"){completion_delimiter} + +################ +-真实数据- +################ +实体类型:{entity_types} +序列元数据:{metadata_text} +序列片段:{sequence_chunk} +################ +输出: +""" + + +CONTINUE_EN: str = """MANY entities and relationships were missed in the last extraction. \ +Add them below using the same format: +""" + +CONTINUE_ZH: str = """很多实体和关系在上一次的提取中可能被遗漏了。请在下面使用相同的格式添加它们:""" + +IF_LOOP_EN: str = """It appears some entities and relationships may have still been missed. \ +Answer YES | NO if there are still entities and relationships that need to be added. +""" + +IF_LOOP_ZH: str = """看起来可能仍然遗漏了一些实体和关系。如果仍有实体和关系需要添加,请回答YES | NO。""" + +OMICS_KG_EXTRACTION_PROMPT: dict = { + "en": { + "TEMPLATE": TEMPLATE_EN, + "CONTINUE": CONTINUE_EN, + "IF_LOOP": IF_LOOP_EN, + }, + "zh": { + "TEMPLATE": TEMPLATE_ZH, + "CONTINUE": CONTINUE_ZH, + "IF_LOOP": IF_LOOP_ZH, + }, + "FORMAT": { + "tuple_delimiter": "<|>", + "record_delimiter": "##", + "completion_delimiter": "<|COMPLETE|>", + "entity_types": "gene, rna, protein, organism, location, biological_process, rna_type, protein_domain, \ +mutation, pathway, disease, function, structure", + }, +} diff --git a/graphgen/utils/loop.py b/graphgen/utils/loop.py index 5f12fa5b..f0ab7dfd 100644 --- a/graphgen/utils/loop.py +++ b/graphgen/utils/loop.py @@ -1,9 +1,10 @@ import asyncio +from typing import Tuple from .log import logger -def create_event_loop() -> asyncio.AbstractEventLoop: +def create_event_loop() -> Tuple[asyncio.AbstractEventLoop, bool]: """ Ensure that there is always an event loop available. @@ -11,18 +12,25 @@ def create_event_loop() -> asyncio.AbstractEventLoop: it creates a new event loop and sets it as the current event loop. Returns: - asyncio.AbstractEventLoop: The current or newly created event loop. + Tuple[asyncio.AbstractEventLoop, bool]: The event loop and a flag + indicating if we created it (True) or it was already running (False). 
""" try: - # Try to get the current event loop - current_loop = asyncio.get_event_loop() - if current_loop.is_closed(): - raise RuntimeError("Event loop is closed.") - return current_loop - + # Try to get the running event loop (Python 3.7+) + running_loop = asyncio.get_running_loop() + # If we get here, there's already a running loop + return running_loop, False except RuntimeError: - # If no event loop exists or it is closed, create a new one - logger.info("Creating a new event loop in main thread.") - new_loop = asyncio.new_event_loop() - asyncio.set_event_loop(new_loop) - return new_loop + # No running loop, try to get the current event loop + try: + current_loop = asyncio.get_event_loop() + if current_loop.is_closed(): + raise RuntimeError("Event loop is closed.") from None + # Loop exists but not running, we can use it + return current_loop, False + except RuntimeError: + # No event loop exists, create a new one + logger.info("Creating a new event loop in main thread.") + new_loop = asyncio.new_event_loop() + asyncio.set_event_loop(new_loop) + return new_loop, True diff --git a/graphgen/utils/run_concurrent.py b/graphgen/utils/run_concurrent.py index d1a9b0e2..dfbfde32 100644 --- a/graphgen/utils/run_concurrent.py +++ b/graphgen/utils/run_concurrent.py @@ -1,5 +1,5 @@ import asyncio -from typing import Awaitable, Callable, List, TypeVar +from typing import Awaitable, Callable, List, Optional, TypeVar, Union from tqdm.asyncio import tqdm as tqdm_async @@ -17,28 +17,97 @@ def run_concurrent( *, desc: str = "processing", unit: str = "item", -) -> List[R]: + save_interval: int = 0, + save_callback: Optional[Callable[[List[R], int], None]] = None, + max_concurrent: Optional[int] = None, +) -> Union[List[R], Awaitable[List[R]]]: + """ + Run coroutines concurrently with optional periodic saving. 
+ This function can be used in both sync and async contexts: + - In sync context: returns List[R] directly + - In async context: returns Awaitable[List[R]] (use with 'await') + :return: List of results (in sync context) or coroutine (in async context) + """ async def _run_all(): - tasks = [asyncio.create_task(coro_fn(item)) for item in items] + if not items: + return [] + # Use semaphore to limit concurrent tasks if max_concurrent is specified + semaphore = asyncio.Semaphore(max_concurrent) if max_concurrent is not None and max_concurrent > 0 else None + async def run_with_semaphore(item: T) -> R: + """Wrapper to apply semaphore if needed.""" + if semaphore: + async with semaphore: + return await coro_fn(item) + else: + return await coro_fn(item) + + # Create tasks with concurrency limit + if max_concurrent is not None and max_concurrent > 0: + # Use semaphore-controlled wrapper + tasks = [asyncio.create_task(run_with_semaphore(it)) for it in items] + else: + # Original behavior: create all tasks at once + tasks = [asyncio.create_task(coro_fn(it)) for it in items] + + completed_count = 0 results = [] + pending_save_results = [] pbar = tqdm_async(total=len(items), desc=desc, unit=unit) for future in asyncio.as_completed(tasks): try: result = await future results.append(result) - except Exception as e: + if save_interval > 0 and save_callback is not None: + pending_save_results.append(result) + except Exception as e: # pylint: disable=broad-except logger.exception("Task failed: %s", e) + # even if failed, record it to keep results consistent with tasks results.append(e) + completed_count += 1 pbar.update(1) + # Periodic save + if save_interval > 0 and save_callback is not None and completed_count % save_interval == 0: + try: + # Filter out exceptions before saving + valid_results = [res for res in pending_save_results if not isinstance(res, Exception)] + save_callback(valid_results, completed_count) + pending_save_results = [] # Clear after saving + logger.info("Saved intermediate results: %d/%d completed", completed_count, len(items)) + except Exception as e: + logger.warning("Failed to save intermediate results: %s", e) + pbar.close() + + # Save remaining results if any + if save_interval > 0 and save_callback is not None and pending_save_results: + try: + valid_results = [res for res in pending_save_results if not isinstance(res, Exception)] + save_callback(valid_results, completed_count) + logger.info("Saved final intermediate results: %d completed", completed_count) + except Exception as e: + logger.warning("Failed to save final intermediate results: %s", e) + + # filter out exceptions return [res for res in results if not isinstance(res, Exception)] - loop = create_event_loop() + # Check if we're in an async context (event loop is running) try: - return loop.run_until_complete(_run_all()) - finally: - loop.close() + _ = asyncio.get_running_loop() + # If we're in an async context, return the coroutine directly + # The caller should use 'await run_concurrent(...)' + return _run_all() + except RuntimeError: + # No running loop, we can create one and run until complete + if not items: + return [] + loop, created = create_event_loop() + try: + return loop.run_until_complete(_run_all()) + finally: + # Only close the loop if we created it + if created: + loop.close()
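
Because `run_concurrent` now returns a list when called from synchronous code but a coroutine when an event loop is already running, and because it adds the `max_concurrent`, `save_interval`, and `save_callback` knobs, callers in the two contexts need slightly different call patterns. A minimal usage sketch follows; the `fetch` coroutine and `save_batch` callback are illustrative stand-ins, not part of this patch.

```python
import asyncio

from graphgen.utils import run_concurrent


async def fetch(item: int) -> int:
    """Illustrative coroutine standing in for a search or LLM call."""
    await asyncio.sleep(0.01)
    return item * 2


def save_batch(results: list, completed_count: int) -> None:
    """Illustrative save_callback: receives the successful results gathered
    since the last checkpoint and the total number of completed items."""
    print(f"checkpoint at {completed_count}: {len(results)} new results")


# Sync context: run_concurrent drives its own event loop and returns a list.
doubled = run_concurrent(
    fetch,
    list(range(100)),
    desc="demo",
    unit="item",
    max_concurrent=8,  # cap in-flight tasks via the internal semaphore
    save_interval=25,  # flush pending results to save_batch every 25 completions
    save_callback=save_batch,
)
assert len(doubled) == 100


# Async context: the same call returns a coroutine that must be awaited.
async def main() -> list:
    return await run_concurrent(fetch, list(range(10)), desc="demo", unit="item")


asyncio.run(main())
```

Note that failed tasks are logged and filtered out of the returned list, so callers should not assume `len(results) == len(items)` when exceptions are possible.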