python string bioinformatics biopython fasta

determine position of an inserted string within another

Following this post I managed to put together a small function to place within a bigger text body (FASTA) shorter strings determined from another file based on some conditions (e.g. 100 events from a subset of only those 400-to-500 characters in length, and selected randomly).

Now, I'm pretty fine with the result; however, I wish to print for those 100 events where exactly they have been added in the bigger text body — ideally, start-end position if not too hard.

I guess this could be integrated in get_retro_text() or, if easier, build as an external function, but I cannot really figure out from where to start... any help is greatly appreciated, thanks in advance!

###library import
from Bio import SeqIO
import random

###string import and wrangling
input_file = open("homo_sapiens_strings.fasta.txt")
my_dict = SeqIO.to_dict(SeqIO.parse(input_file, "fasta"))

s = []
for j in my_dict.values():
   s.append(j)

###import FASTA --> some already made function I found to import and print whole FASTA genomes but testing on a part of it
def fasta_reader(filename):
  from Bio.SeqIO.FastaIO import FastaIterator
  with open(filename) as handle:
    for record in FastaIterator(handle):
      yield record

head = ""
body = ""
for entry in fasta_reader("hg37_chr1.fna"):
  head = str(entry.id)
  body = str(entry.seq)

###randomly selects 100 sequences and adds them to the FASTA
def insert (source_str, insert_str, pos):
    return source_str[:pos] + insert_str + source_str[pos:]

def get_retro_text(genome, all_strings):
    string_of_choice = [string for string in all_strings if 400 < len(string) < 500]
    hundred_strings = random.sample(string_of_choice, k=100)

    text_of_strings = []
    for k in range(len(hundred_strings)):
        text_of_strings.append(str(hundred_strings[k].seq))

    single_string = ",".join(text_of_strings)
    new_genome = insert(genome, single_string, random.randint(0, len(genome)))
    
    return new_genome

big_genome = get_retro_text(body, s)

EDIT example of structure of body and s

body


NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNtaaccctaaccctaacccta
accctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaacccta
accctaaccctaaccctaaccctaacccaaccctaaccctaaccctaaccctaaccctaa
ccctaacccctaaccctaaccctaaccctaaccctaacctaaccctaaccctaaccctaa
ccctaaccctaaccctaaccctaaccctaacccctaaccctaaccctaaaccctaaaccc
taaccctaaccctaaccctaaccctaaccccaaccccaaccccaaccccaaccccaaccc
caaccctaacccctaaccctaaccctaaccctaccctaaccctaaccctaaccctaaccc
taaccctaacccctaacccctaaccctaaccctaaccctaaccctaaccctaaccctaac
ccctaaccctaaccctaaccctaaccctcgCGGTACCCTCAGCCGGCCCGCCCGCCCGGG
TCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGTGCAGAGGAc
aacgcagctccgccctcgcggtGCTCtccgggtctgtgctgaggagaacgCAACTCCGCC
GTTGCAAAGGCGcgccgcgccggcgcaggcgcagagaggcgcgccgcgccggcgcaggcg
cagagaggcgcgccgcgccggcgcaggcgcagagaggcgcgccgcgccggcgcaggcgca
gagaggcgcgccgcgccggcgcaggcgcagagaggcgcgccgcgccggcgcaggcgcaga
caCATGCTAGCGCGTCGGGGTGGAGGCgtggcgcaggcgcagagaggcgcgccgcgccgg
cgcaggcgcagagacaCATGCTACCGCGTCCAGGGGTGGAGGCgtggcgcaggcgcagag
aggcgcaccgcgccggcgcaggcgcagagacaCATGCTAGCGCGTCCAGGGGTGGAGGCG
TggcgcaggcgcagagacgcAAGCCTAcgggcgggggttgggggggcgTGTGTTGCAGGA
GCAAAGTCGCACGGCGCCGGGCTGGGGCGGGGGGAGGGTGGCGCCGTGCACGCGCAGAAA
CTCACGTCACGGTGGCGCGGCGCAGAGACGGGTAGAACCTCAGTAATCCGAAAAGCCGGG
ATCGACCGCCCCTTGCTTGCAGCCGGGCACTACAGGACCCGCTTGCTCACGGTGCTGTGC
CAGGGCGCCCCCTGCTGGCGACTAGGGCAACTGCAGGGCTCTCTTGCTTAGAGTGGTGGC
CAGCGCCCCCTGCTGGCGCCGGGGCACTGCAGGGCCCTCTTGCTTACTGTATAGTGGTGG
CACGCCGCCTGCTGGCAGCTAGGGACATTGCAGGGTCCTCTTGCTCAAGGTGTAGTGGCA
GCACGCCCACCTGCTGGCAGCTGGGGACACTGCCGGGCCCTCTTGCTCCAACAGTACTGG
CGGATTATAGGGAAACACCCGGAGCATATGCTGTTTGGTCTCAGTAGACTCCTAAATATG
GGATTCCTgggtttaaaagtaaaaaataaatatgtttaatttgtGAACTGATTACCATCA
GAATTGTACTGTTCTGTATCCCACCAGCAATGTCTAGGAATGCCTGTTTCTCCACAAAGT
GTTtacttttggatttttgccagTCTAACAGGTGAAGCCCTGGAGATTCTTATTAGTGAT
TTGGGCTGGGGCCTGgccatgtgtatttttttaaatttccactgaTGATTTTGCTGCATG
GCCGGTGTTGAGAATGACTGCGCAAATTTGCCGGATTTCCTTTGCTGTTCCTGCATGTAG
TTTAAACGAGATTGCCAGCACCGGGTATCATTCACCATTTTTCTTTTCGTTAACTTGCCG
TCAGCCTTTTCTTTGACCTCTTCTTTCTGTTCATGTGTATTTGCTGTCTCTTAGCCCAGA
CTTCCCGTGTCCTTTCCACCGGGCCTTTGAGAGGTCACAGGGTCTTGATGCTGTGGTCTT
CATCTGCAGGTGTCTGACTTCCAGCAACTGCTGGCCTGTGCCAGGGTGCAAGCTGAGCAC
TGGAGTGGAGTTTTCCTGTGGAGAGGAGCCATGCCTAGAGTGGGATGGGCCATTGTTCAT

s

[[SeqRecord(seq=Seq('ATGGCGGGACACCCGAAAGAGAGGGTGGTCACAGATGAGGTCCATCAGAACCAG...TAG'), id='retro_hsap_1', name='retro_hsap_1', description='retro_hsap_1', dbxrefs=[]), SeqRecord(seq=Seq('ATGGTCAACGTACCTAAAACCCGAAGAACCTTCTGTAAGAAGTGTGGCAAGCAT...TAA'), id='retro_hsap_2', name='retro_hsap_2', description='retro_hsap_2', dbxrefs=[]), SeqRecord(seq=Seq('ATGTCCACAATGGGAAACGAGGCCAGTTACCCGGCGGAGATGTGCTCCCACTTT...TGA'), id='retro_hsap_3', name='retro_hsap_3', description='retro_hsap_3', dbxrefs=[])]]

Solution

Your current code has some issues:

It inserts the 100 randomly selected strings all adjacent to eachother in the genome
The 100 strings are concatenated with commas, which end up in the final gnome string

So that would need to be fixed first before getting to the question of the positions where the insertions happen.

I would tackle these issues as follows. I have placed comments where I changed the code

import random

# removed the insert function: will not be used

def get_retro_text(genome, all_strings):
    string_of_choice = [string for string in all_strings if 400 < len(string) < 500]
    hundred_strings = random.sample(string_of_choice, k=100)
    # get a sorted list of randomly selected insertion points in the genome    
    indices = sorted(random.randrange(len(genome)) for _ in hundred_strings)
    # get a list sub strings that result from slicing up the genome at the random insertion points   
    slices = [genome[i:j] for i, j in zip([0] + indices, indices + [len(genome)])]
    
    # determine what the offsets will be once the selected strings are inserted into the genome   
    start = 0
    offsets = [(start := start + len(infix), start := start + len(string)) for infix, string in zip(slices, hundred_strings)]
    # finally build the new genome by alternating the slices (substrings) with the strings to insert
    new_genome = "".join("".join(pairs) for pairs in zip(slices, hundred_strings)) + slices[-1]
    # return both the new genome and the information of where insertions were made
    return new_genome, offsets


# if you have items with a seq attribute, then extract that first:
s = [str(item.seq) for item in s]
# get both the genome and the information about the insertion points
big_genome, offsets = get_retro_text(body, s)

print(big_genome)
print(offsets)