-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
50 lines (39 loc) · 1.34 KB
/
utils.py
File metadata and controls
50 lines (39 loc) · 1.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
from Bio import SeqIO
import numpy as np
def kmers(sequence, k=21):
    """
    Generate all overlapping k-mers of length `k` from a sequence.

    A k-mer is a substring of length `k` extracted from the input sequence.
    The window slides one position at a time, so consecutive k-mers overlap
    by ``k - 1`` characters, and the result preserves the order in which the
    k-mers appear in `sequence`.

    Parameters
    ----------
    sequence : str
        The input sequence from which to extract k-mers.
    k : int, optional
        The length of each k-mer to be generated. Defaults to 21, the window
        size this function used before `k` became configurable.

    Returns
    -------
    list
        A list of k-mers (substrings) of length `k` extracted from
        `sequence`. The list is empty when `sequence` is shorter than `k`.

    Raises
    ------
    ValueError
        If `k` is less than 1.

    Examples
    --------
    >>> kmers("ATCG", 2)
    ['AT', 'TC', 'CG']
    >>> kmers("ATCG", 3)
    ['ATC', 'TCG']
    >>> kmers("ATCG")
    []
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")
    # A sequence of length n contains n - k + 1 windows; when n < k the range
    # is empty and an empty list is returned instead of raising.
    return [sequence[start:start + k] for start in range(len(sequence) - k + 1)]
def one_hot_encode_kmer(kmer):
    """
    One-hot encode a k-mer into a flat NumPy vector.

    Each base is mapped to a 4-element indicator in the column order
    A, T, C, G, and the per-base indicators are laid end to end.

    Parameters
    ----------
    kmer : str
        Sequence of bases to encode. Only the upper-case characters
        'A', 'T', 'C' and 'G' are recognised.

    Returns
    -------
    numpy.ndarray
        One-dimensional array holding four entries per base of `kmer`,
        in the same order as the bases.

    Raises
    ------
    KeyError
        If `kmer` contains a character other than 'A', 'T', 'C' or 'G'.
    """
    base_to_indicator = {
        'A': [1, 0, 0, 0],
        'T': [0, 1, 0, 0],
        'C': [0, 0, 1, 0],
        'G': [0, 0, 0, 1],
    }
    encoded = []
    for nucleotide in kmer:
        # Appending the four indicator values directly yields the same
        # layout as stacking the rows and flattening them afterwards.
        encoded.extend(base_to_indicator[nucleotide])
    return np.array(encoded)
def extract_data_from_Fasta(file, path):
    """
    Read a FASTA file and return its record as a plain dictionary.

    The file location is formed by concatenating `path` and `file`
    directly; no separator is inserted between them, so `path` must
    already end with one when it names a directory.

    Parameters
    ----------
    file : str
        File name, appended to `path`.
    path : str
        Prefix placed in front of `file`.

    Returns
    -------
    dict
        Dictionary with the keys ``name``, ``description``, ``sequence``
        (the record's sequence converted to ``str``) and ``size`` (the
        length of that sequence).

    Notes
    -----
    The file is parsed with ``SeqIO.read``, which is intended for files
    holding a single record -- see the Biopython documentation for how it
    behaves on other inputs.
    """
    location = path + file
    record = SeqIO.read(location, format='fasta')
    residues = record.seq
    return {
        'name': record.name,
        'description': record.description,
        'sequence': str(residues),
        'size': len(residues),
    }