I have this code for converting a DNA string into a list of codons and after that to convert this list into a string with their respective aminoacids. However, when I ran the code and the DNA string ends in a pair of nucleotides(like CT for instance) and not in a triplet, then the code doesn't generate the aminoacid sequence. As you can see in the output.
from collections import defaultdict
from collections import Counter
dna_sequence = "GAGCGTCTGCTCCGTGTATAAGCCACGTCGGAGCT"
codons = [dna_sequence[i:i+3]
for i in range (0, len(dna_sequence), 3)]
print(codons)
genetic_code = {
"GCG" :"A","GCA" :"A","GCT" :"A","GCC" :"A",
"AGG" :"R","AGA" :"R","CGG" :"R", "CGA" :"R","CGT" :"R","CGC" :"R",
"AAT" :"N","AAC" :"N",
"GAT" :"D", "GAC":"D", "TGT" :"C","TGC" :"C",
"TGA" :"*","TAG" :"*","TAA" :"*", # * Stop codon
"CAG" :"Q","CAA" :"Q",
"GAG" :"E","GAA" :"E",
"GGG" :"G","GGA" :"G","GGT" :"G","GGC" :"G",
"CAT" :"H","CAC" :"H",
"ATA" :"I","ATT" :"I","ATC" :"I",
"TTG" :"L","TTA" :"L","CTG" : "L","CTA" :"L","CTT" :"L","CTC" :"L",
"AAG" :"K","AAA" :"K",
"ATG" :"M" , # Start codon
"TTT" :"F" ,"TTC" :"F" ,
"CCG" :"P" ,"CCA" :"P" ,"CCT" :"P" ,"CCC" :"P" ,
"AGT" :"S" ,"AGC" :"S" ,"TCG" :"S" ,"TCA" :"S" ,"TCT" :"S" ,"TCC" :"S" ,
"ACG" :"T" ,"ACA" :"T" ,"ACT" :"T" ,"ACC" :"T" ,
"TGG" :"W" ,
"TAT" :"Y" ,"TAC" :"Y" ,
"GTG" :"V" ,"GTA" :"V" ,"GTT" :"V" ,"GTC" :"V"
}
def codon_seq(seq):
tmpList = []
for i in range(0, len(seq) - 2, 3):
if genetic_code [seq[i:i + 3]]:
tmpList.append(seq[i:i + 3])
print(tmpList)
def amino_seq(seq):
protein =""
if len(seq)%3 == 0:
for i in range(0, len(seq), 3):
codon = seq[i:i + 3]
protein+= genetic_code[codon]
return protein
print("Aminoacids: ")
amino_seq(dna_sequence)
OUTPUT:
Codons: ['GAG', 'CGT', 'CTG', 'CTC', 'CGT', 'GTA', 'TAA', 'GCC', 'ACG', 'TCG', 'GAG', 'CT']
Aminoacids: ''
I'd like to find a solution in order that I can use the two last nucleotides of the string to predict the next aminoacid, selecting the codons (genetic_code) that start with these two nucleotides and choosing one ramdonly. How could I do that? Any advice please.
You can define a function random_protein
to get all the codons that start with the given prefix and random.choice
one of those and use that if the len
of the current codon is != 3
.
import random
def random_protein(prefix):
codons = [c for c in genetic_code if c.startswith(prefix)]
return genetic_code[random.choice(codons)]
dna_sequence = "GAGCGTCTGCTCCGTGTATAAGCCACGTCGGAGCT"
codons = [dna_sequence[i:i+3] for i in range (0, len(dna_sequence), 3)]
proteins = [genetic_code[c] if len(c) == 3 else random_protein(c) for c in codons]