coding-bootcamp/ProblemSet1.py at master · macadology/coding-bootcamp · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# %% [markdown]
# # Problem Set 1
# When running the code in a new repl, you will need to install the following packages
# 1. biopython
# 1. logomaker
# 1. pandas
# In repl, on the left menu, look for Packages. Search and install the required packages.
# %%
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.SeqUtils import GC
import numpy as np

# %% Code Setup
my_seq = Seq("AGTACGCAACTGGTA")
my_seq2 = Seq("AGTACACGCGCGCTGGTA", IUPAC.unambiguous_dna)
my_prot = Seq("AGTACACTGGTA", IUPAC.protein)

# %% [markdown]
# Complete the codes below

# %%
# my_seq's variable type, my_prot's variable type
type_of_my_seq =
type_of_my_prot =


# %%
# Join sequences my_seq and my_seq2 into my_seq3.
my_seq3 =


# %%
# Find the number of base T in my_seq
t_myseq =

# %%
# Find the GC content of a DNA given the function GC
gc_myseq =


# %%
# Find the geometric mean of base A in my_seq and my_seq2
gmean =

# %%
# Find the abundance of A, T, C and G in my_seq. Report the answer as a list of abundances.
abundance_my_seq =

# %%
# Find the abundance of A, T, C and G in my_seq. Report the answer as a numpy array of abundances.
abundance_my_seq_array =

# %%
# Find the abundance of A, T, C and G in my_seq. Report the answer as a dictionary of abundances.
abundance_my_seq_dict =

# %%
# Transcribe my_seq to mRNA using the method .reverse_complement()
transcript_my_seq =


# %%
# Flip 5' and 3' end of my_seq
flipped_my_seq =


# %% [markdown]
# # Troubleshooting exercise
# The following code convert a DNA sequence to its protein sequence and displays the consensus sequence. Can you identify the errors and correct it?

# %%
# Import additional libraries
from Bio.Blast import NCBIWWW, NCBIXML
from Bio import SeqIO
import logomaker
import matplotlib.pyplot as plt
import pandas as pd

# %% [markdown]
# Perform a blast search of accession number [AY527219](https://www.ncbi.nlm.nih.gov/nuccore/AY527219). Due to the time it takes to run blast in repl, I have provided a copy of the results. Saves the blast results in `blast_results.xml`.

# %%
#result_handle = NCBIWWW.qblast("blastn", "nt", "AY527219")
#with open("blast_results.xml", "w") as out_handle:
#    out_handle.write(result_handle.read())

#result_handle.close()

# %% [markdown]
# Load `blast_results.xml`.
#
# **Spot the error!**

# %%
result_handle = open(blast_results.xml)
blast_record = NCBIXML.read(result_handle)

# %% [markdown]
# The following code snippet extract the first 120 base pairs from the matching genes.
# **Spot the error!**

# %%
# Extract first 120 bases of matching sequences.
base_pos1 = '0'
base_pos2 = '120'
matches = []
for alignment in blast_record.alignments:
    for hsp in alignment.hsps:
        matches.append(hsp.sbjct[base_pos1:base_pos2])

pd.DataFrame(matches)

# %% [markdown]
# Translate the codon to their respective amino acids.

# %%
# Convert gene to protein sequences
matches_prot = []
expected_length_primary_sequence = 5 # This is incorrect, choose the right expected length?
for sequence in matches:
    pri_seq = Seq(sequence).translate() # Translate sequence
    try:
        assert len(pri_seq) == expected_length_primary_sequence
    except:
        raise AssertionError('The protein length obtained is different from the expected length. Change accordingly')
    matches_prot.append(list(pri_seq))

# Check if
matches_df = pd.DataFrame(matches_prot)
matches_df

# %% [markdown]
# Count the number of each type of amino acid at each position.

# %%
prot_aa = []
for ind, pri_seq in matches_df.iteritems():
    aa_abundance = pri_seq.value_counts()
    prot_aa.append(aa_abundance)

prot_df = pd.DataFrame(prot_aa).fillna(0)
prot_df

# %% [markdown]
#

# %%
# Display Consensus Sequence
crp_logo = logomaker.Logo(prot_df, figsize=(10,2), color_scheme='chemistry')

# %% [markdown]
# Identify the error.

# %%
# style and show figure
crp_logo.ax.set_xlabel(Percentage)
crp_logo.ax.set_title(Primary consensus sequence)
crp_logo.fig

# %%