Following this post I managed to put together a small function to place within a bigger text body (FASTA) shorter strings determined from another file based on some conditions (e.g. 100 events from a subset of only those 400-to-500 characters in length, and selected randomly).
Now, I'm pretty fine with the result; however, I wish to print for those 100 events where exactly they have been added in the bigger text body — ideally, start-end position if not too hard.
I guess this could be integrated in get_retro_text()
or, if easier, build as an external function, but I cannot really figure out from where to start... any help is greatly appreciated, thanks in advance!
###library import
from Bio import SeqIO
import random
###string import and wrangling
input_file = open("homo_sapiens_strings.fasta.txt")
my_dict = SeqIO.to_dict(SeqIO.parse(input_file, "fasta"))
s = []
for j in my_dict.values():
s.append(j)
###import FASTA --> some already made function I found to import and print whole FASTA genomes but testing on a part of it
def fasta_reader(filename):
from Bio.SeqIO.FastaIO import FastaIterator
with open(filename) as handle:
for record in FastaIterator(handle):
yield record
head = ""
body = ""
for entry in fasta_reader("hg37_chr1.fna"):
head = str(entry.id)
body = str(entry.seq)
###randomly selects 100 sequences and adds them to the FASTA
def insert (source_str, insert_str, pos):
return source_str[:pos] + insert_str + source_str[pos:]
def get_retro_text(genome, all_strings):
string_of_choice = [string for string in all_strings if 400 < len(string) < 500]
hundred_strings = random.sample(string_of_choice, k=100)
text_of_strings = []
for k in range(len(hundred_strings)):
text_of_strings.append(str(hundred_strings[k].seq))
single_string = ",".join(text_of_strings)
new_genome = insert(genome, single_string, random.randint(0, len(genome)))
return new_genome
big_genome = get_retro_text(body, s)
EDIT example of structure of body
and s
body
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNtaaccctaaccctaacccta
accctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaacccta
accctaaccctaaccctaaccctaacccaaccctaaccctaaccctaaccctaaccctaa
ccctaacccctaaccctaaccctaaccctaaccctaacctaaccctaaccctaaccctaa
ccctaaccctaaccctaaccctaaccctaacccctaaccctaaccctaaaccctaaaccc
taaccctaaccctaaccctaaccctaaccccaaccccaaccccaaccccaaccccaaccc
caaccctaacccctaaccctaaccctaaccctaccctaaccctaaccctaaccctaaccc
taaccctaacccctaacccctaaccctaaccctaaccctaaccctaaccctaaccctaac
ccctaaccctaaccctaaccctaaccctcgCGGTACCCTCAGCCGGCCCGCCCGCCCGGG
TCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGTGCAGAGGAc
aacgcagctccgccctcgcggtGCTCtccgggtctgtgctgaggagaacgCAACTCCGCC
GTTGCAAAGGCGcgccgcgccggcgcaggcgcagagaggcgcgccgcgccggcgcaggcg
cagagaggcgcgccgcgccggcgcaggcgcagagaggcgcgccgcgccggcgcaggcgca
gagaggcgcgccgcgccggcgcaggcgcagagaggcgcgccgcgccggcgcaggcgcaga
caCATGCTAGCGCGTCGGGGTGGAGGCgtggcgcaggcgcagagaggcgcgccgcgccgg
cgcaggcgcagagacaCATGCTACCGCGTCCAGGGGTGGAGGCgtggcgcaggcgcagag
aggcgcaccgcgccggcgcaggcgcagagacaCATGCTAGCGCGTCCAGGGGTGGAGGCG
TggcgcaggcgcagagacgcAAGCCTAcgggcgggggttgggggggcgTGTGTTGCAGGA
GCAAAGTCGCACGGCGCCGGGCTGGGGCGGGGGGAGGGTGGCGCCGTGCACGCGCAGAAA
CTCACGTCACGGTGGCGCGGCGCAGAGACGGGTAGAACCTCAGTAATCCGAAAAGCCGGG
ATCGACCGCCCCTTGCTTGCAGCCGGGCACTACAGGACCCGCTTGCTCACGGTGCTGTGC
CAGGGCGCCCCCTGCTGGCGACTAGGGCAACTGCAGGGCTCTCTTGCTTAGAGTGGTGGC
CAGCGCCCCCTGCTGGCGCCGGGGCACTGCAGGGCCCTCTTGCTTACTGTATAGTGGTGG
CACGCCGCCTGCTGGCAGCTAGGGACATTGCAGGGTCCTCTTGCTCAAGGTGTAGTGGCA
GCACGCCCACCTGCTGGCAGCTGGGGACACTGCCGGGCCCTCTTGCTCCAACAGTACTGG
CGGATTATAGGGAAACACCCGGAGCATATGCTGTTTGGTCTCAGTAGACTCCTAAATATG
GGATTCCTgggtttaaaagtaaaaaataaatatgtttaatttgtGAACTGATTACCATCA
GAATTGTACTGTTCTGTATCCCACCAGCAATGTCTAGGAATGCCTGTTTCTCCACAAAGT
GTTtacttttggatttttgccagTCTAACAGGTGAAGCCCTGGAGATTCTTATTAGTGAT
TTGGGCTGGGGCCTGgccatgtgtatttttttaaatttccactgaTGATTTTGCTGCATG
GCCGGTGTTGAGAATGACTGCGCAAATTTGCCGGATTTCCTTTGCTGTTCCTGCATGTAG
TTTAAACGAGATTGCCAGCACCGGGTATCATTCACCATTTTTCTTTTCGTTAACTTGCCG
TCAGCCTTTTCTTTGACCTCTTCTTTCTGTTCATGTGTATTTGCTGTCTCTTAGCCCAGA
CTTCCCGTGTCCTTTCCACCGGGCCTTTGAGAGGTCACAGGGTCTTGATGCTGTGGTCTT
CATCTGCAGGTGTCTGACTTCCAGCAACTGCTGGCCTGTGCCAGGGTGCAAGCTGAGCAC
TGGAGTGGAGTTTTCCTGTGGAGAGGAGCCATGCCTAGAGTGGGATGGGCCATTGTTCAT
s
[[SeqRecord(seq=Seq('ATGGCGGGACACCCGAAAGAGAGGGTGGTCACAGATGAGGTCCATCAGAACCAG...TAG'), id='retro_hsap_1', name='retro_hsap_1', description='retro_hsap_1', dbxrefs=[]), SeqRecord(seq=Seq('ATGGTCAACGTACCTAAAACCCGAAGAACCTTCTGTAAGAAGTGTGGCAAGCAT...TAA'), id='retro_hsap_2', name='retro_hsap_2', description='retro_hsap_2', dbxrefs=[]), SeqRecord(seq=Seq('ATGTCCACAATGGGAAACGAGGCCAGTTACCCGGCGGAGATGTGCTCCCACTTT...TGA'), id='retro_hsap_3', name='retro_hsap_3', description='retro_hsap_3', dbxrefs=[])]]
Your current code has some issues:
It inserts the 100 randomly selected strings all adjacent to eachother in the genome
The 100 strings are concatenated with commas, which end up in the final gnome string
So that would need to be fixed first before getting to the question of the positions where the insertions happen.
I would tackle these issues as follows. I have placed comments where I changed the code
import random
# removed the insert function: will not be used
def get_retro_text(genome, all_strings):
string_of_choice = [string for string in all_strings if 400 < len(string) < 500]
hundred_strings = random.sample(string_of_choice, k=100)
# get a sorted list of randomly selected insertion points in the genome
indices = sorted(random.randrange(len(genome)) for _ in hundred_strings)
# get a list sub strings that result from slicing up the genome at the random insertion points
slices = [genome[i:j] for i, j in zip([0] + indices, indices + [len(genome)])]
# determine what the offsets will be once the selected strings are inserted into the genome
start = 0
offsets = [(start := start + len(infix), start := start + len(string)) for infix, string in zip(slices, hundred_strings)]
# finally build the new genome by alternating the slices (substrings) with the strings to insert
new_genome = "".join("".join(pairs) for pairs in zip(slices, hundred_strings)) + slices[-1]
# return both the new genome and the information of where insertions were made
return new_genome, offsets
# if you have items with a seq attribute, then extract that first:
s = [str(item.seq) for item in s]
# get both the genome and the information about the insertion points
big_genome, offsets = get_retro_text(body, s)
print(big_genome)
print(offsets)