infer_from_cds in UTRFetcher does not work correctly

Currently, UTR region inference works only for non-spliced UTR regions:
https://github.com/kipoi/kipoiseq/blob/e67fab64ce5306b0f5a39557a93c768f5ab733f6/kipoiseq/extractors/gtf.py#L348-L368

**TODO:**
1) Generate test data with transcript that contains a spliced UTR, e.g.:
`tabix /s/genomes/GenBank/hg38/annotation/hg38.ensGene.gtf.gz chr22 | grep -i ENST00000263207 > kipoiseq/tests/data/chr22_ENST00000263207.gtf`
2) Simulate some variants:
  - `chr22_ENST00000263207_3UTR.vcf.gz`
  - `chr22_ENST00000263207_5UTR.vcf.gz`
3) Generate with `infer_from_cds=False`:
  - `chr22_ENST00000263207_3UTR.alt_seqs.txt`
  - `chr22_ENST00000263207_3UTR.ref_seq.txt`
  - `chr22_ENST00000263207_5UTR.alt_seqs.txt`
  - `chr22_ENST00000263207_5UTR.ref_seq.txt`
4) Update tests: https://github.com/kipoi/kipoiseq/blob/1d72daf7373d2b105a8c788016f1777ec47b6259/tests/extractors/test_protein.py#L323-L358

	# get start and end of cds for each transcript
	cds = CDSFetcher.get_cds_from_gtf(df=df, on_error_warn=on_error_warn) \
	.groupby('transcript_id') \
	.agg({'Start': min, 'End': max})

	# join cds start and end to utr df
	utr_df = df.query("Feature == 'transcript'") \
	.set_index('transcript_id') \
	.join(cds, rsuffix="_cds") \
	.dropna(subset=['Start_cds', 'End_cds'], axis=0)

	if feature_type.upper() == "5UTR":
	utr_df['Start'] = np.where(utr_df['Strand'] == '+', int(utr_df['Start']), int(utr_df['End_cds']))
	utr_df['End'] = np.where(utr_df['Strand'] == '+', int(utr_df['Start_cds']), int(utr_df['End']))
	utr_df['Feature'] = pd.Categorical("5UTR", categories = utr_df['Feature'])
	if feature_type.upper() == "3UTR":
	utr_df['Start'] = np.where(utr_df['Strand'] == '+', int(utr_df['End_cds']), int(utr_df['Start']))
	utr_df['End'] = np.where(utr_df['Strand'] == '+', int(utr_df['End']), int(utr_df['Start_cds']))
	utr_df['Feature'] = pd.Categorical("3UTR", categories = utr_df['Feature'])

	utr_df.drop(['Start_cds', 'End_cds'], axis=1, inplace=True)

	# chr22_fasta_file = 'tests/data/chr22.fa.gz'
	chr22_gtf_file = 'tests/data/chr22_ENST00000319363.gtf'


	# chr22_5UTR_vcf_file = 'tests/data/chr22_ENST00000319363_5UTR.vcf.gz'


	def test_5UTRFetcher__read_utr():
	utr5 = UTRFetcher._read_utr(chr22_gtf_file, feature_type="5UTR")

	assert utr5.shape == (1, 12)

	assert utr5.iloc[0].Chromosome == 'chr22'
	assert utr5.iloc[0].Start == 17565848
	assert utr5.iloc[0].End == 17565981
	assert utr5.iloc[0].Strand == "+"

	utr5_from_cds = UTRFetcher._read_utr(chr22_gtf_file, feature_type="5UTR", infer_from_cds=True)

	pd.testing.assert_frame_equal(left = utr5.drop(['exon_number', 'exon_id'], axis=1), right = utr5_from_cds.drop(['exon_number', 'exon_id'], axis=1), check_dtype=False)


	def test_3UTRFetcher__read_utr():
	utr3 = UTRFetcher._read_utr(chr22_gtf_file, feature_type="3UTR")

	assert utr3.shape == (1, 12)

	assert utr3.iloc[0].Chromosome == 'chr22'
	assert utr3.iloc[0].Start == 17590710
	assert utr3.iloc[0].End == 17596583
	assert utr3.iloc[0].Strand == "+"

	utr3_from_cds = UTRFetcher._read_utr(chr22_gtf_file, feature_type="3UTR", infer_from_cds=True)

	pd.testing.assert_frame_equal(left=utr3.drop(['exon_number', 'exon_id'], axis=1),
	right=utr3_from_cds.drop(['exon_number', 'exon_id'], axis=1), check_dtype=False)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

infer_from_cds in UTRFetcher does not work correctly #88

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

infer_from_cds in UTRFetcher does not work correctly #88

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions