-
Notifications
You must be signed in to change notification settings - Fork 14
Open
Description
Currently, UTR region inference works only for non-spliced UTR regions:
kipoiseq/kipoiseq/extractors/gtf.py
Lines 348 to 368 in e67fab6
```python
# get start and end of cds for each transcript
cds = CDSFetcher.get_cds_from_gtf(df=df, on_error_warn=on_error_warn) \
    .groupby('transcript_id') \
    .agg({'Start': min, 'End': max})
# join cds start and end to utr df
utr_df = df.query("Feature == 'transcript'") \
    .set_index('transcript_id') \
    .join(cds, rsuffix="_cds") \
    .dropna(subset=['Start_cds', 'End_cds'], axis=0)
if feature_type.upper() == "5UTR":
    utr_df['Start'] = np.where(utr_df['Strand'] == '+', int(utr_df['Start']), int(utr_df['End_cds']))
    utr_df['End'] = np.where(utr_df['Strand'] == '+', int(utr_df['Start_cds']), int(utr_df['End']))
    utr_df['Feature'] = pd.Categorical("5UTR", categories = utr_df['Feature'])
if feature_type.upper() == "3UTR":
    utr_df['Start'] = np.where(utr_df['Strand'] == '+', int(utr_df['End_cds']), int(utr_df['Start']))
    utr_df['End'] = np.where(utr_df['Strand'] == '+', int(utr_df['End']), int(utr_df['Start_cds']))
    utr_df['Feature'] = pd.Categorical("3UTR", categories = utr_df['Feature'])
utr_df.drop(['Start_cds', 'End_cds'], axis=1, inplace=True)
```
TODO:
- Generate test data with a transcript that contains a spliced UTR, e.g.:
  tabix /s/genomes/GenBank/hg38/annotation/hg38.ensGene.gtf.gz chr22 | grep -i ENST00000263207 > kipoiseq/tests/data/chr22_ENST00000263207.gtf
- Simulate some variants:
  - chr22_ENST00000263207_3UTR.vcf.gz
  - chr22_ENST00000263207_5UTR.vcf.gz
- Generate with infer_from_cds=False:
  - chr22_ENST00000263207_3UTR.alt_seqs.txt
  - chr22_ENST00000263207_3UTR.ref_seq.txt
  - chr22_ENST00000263207_5UTR.alt_seqs.txt
  - chr22_ENST00000263207_5UTR.ref_seq.txt
- Update tests:
kipoiseq/tests/extractors/test_protein.py
Lines 323 to 358 in 1d72daf
```python
# chr22_fasta_file = 'tests/data/chr22.fa.gz'
chr22_gtf_file = 'tests/data/chr22_ENST00000319363.gtf'
# chr22_5UTR_vcf_file = 'tests/data/chr22_ENST00000319363_5UTR.vcf.gz'


def test_5UTRFetcher__read_utr():
    utr5 = UTRFetcher._read_utr(chr22_gtf_file, feature_type="5UTR")
    assert utr5.shape == (1, 12)
    assert utr5.iloc[0].Chromosome == 'chr22'
    assert utr5.iloc[0].Start == 17565848
    assert utr5.iloc[0].End == 17565981
    assert utr5.iloc[0].Strand == "+"
    utr5_from_cds = UTRFetcher._read_utr(chr22_gtf_file, feature_type="5UTR", infer_from_cds=True)
    pd.testing.assert_frame_equal(left = utr5.drop(['exon_number', 'exon_id'], axis=1), right = utr5_from_cds.drop(['exon_number', 'exon_id'], axis=1), check_dtype=False)


def test_3UTRFetcher__read_utr():
    utr3 = UTRFetcher._read_utr(chr22_gtf_file, feature_type="3UTR")
    assert utr3.shape == (1, 12)
    assert utr3.iloc[0].Chromosome == 'chr22'
    assert utr3.iloc[0].Start == 17590710
    assert utr3.iloc[0].End == 17596583
    assert utr3.iloc[0].Strand == "+"
    utr3_from_cds = UTRFetcher._read_utr(chr22_gtf_file, feature_type="3UTR", infer_from_cds=True)
    pd.testing.assert_frame_equal(left=utr3.drop(['exon_number', 'exon_id'], axis=1), right=utr3_from_cds.drop(['exon_number', 'exon_id'], axis=1), check_dtype=False)
```
Metadata
Metadata
Assignees
Labels
No labels