pythonstringbioinformaticsbiopythonfasta

determine position of an inserted string within another


Following this post I managed to put together a small function to place within a bigger text body (FASTA) shorter strings determined from another file based on some conditions (e.g. 100 events from a subset of only those 400-to-500 characters in length, and selected randomly).

Now, I'm pretty fine with the result; however, I wish to print for those 100 events where exactly they have been added in the bigger text body — ideally, start-end position if not too hard.

I guess this could be integrated in get_retro_text() or, if easier, build as an external function, but I cannot really figure out from where to start... any help is greatly appreciated, thanks in advance!

###library import
from Bio import SeqIO
import random

###string import and wrangling
input_file = open("homo_sapiens_strings.fasta.txt")
my_dict = SeqIO.to_dict(SeqIO.parse(input_file, "fasta"))

s = []
for j in my_dict.values():
   s.append(j)

###import FASTA --> some already made function I found to import and print whole FASTA genomes but testing on a part of it
def fasta_reader(filename):
  from Bio.SeqIO.FastaIO import FastaIterator
  with open(filename) as handle:
    for record in FastaIterator(handle):
      yield record

head = ""
body = ""
for entry in fasta_reader("hg37_chr1.fna"):
  head = str(entry.id)
  body = str(entry.seq)

###randomly selects 100 sequences and adds them to the FASTA
def insert (source_str, insert_str, pos):
    return source_str[:pos] + insert_str + source_str[pos:]

def get_retro_text(genome, all_strings):
    string_of_choice = [string for string in all_strings if 400 < len(string) < 500]
    hundred_strings = random.sample(string_of_choice, k=100)

    text_of_strings = []
    for k in range(len(hundred_strings)):
        text_of_strings.append(str(hundred_strings[k].seq))

    single_string = ",".join(text_of_strings)
    new_genome = insert(genome, single_string, random.randint(0, len(genome)))
    
    return new_genome

big_genome = get_retro_text(body, s)

EDIT example of structure of body and s

body


NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNtaaccctaaccctaacccta
accctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaacccta
accctaaccctaaccctaaccctaacccaaccctaaccctaaccctaaccctaaccctaa
ccctaacccctaaccctaaccctaaccctaaccctaacctaaccctaaccctaaccctaa
ccctaaccctaaccctaaccctaaccctaacccctaaccctaaccctaaaccctaaaccc
taaccctaaccctaaccctaaccctaaccccaaccccaaccccaaccccaaccccaaccc
caaccctaacccctaaccctaaccctaaccctaccctaaccctaaccctaaccctaaccc
taaccctaacccctaacccctaaccctaaccctaaccctaaccctaaccctaaccctaac
ccctaaccctaaccctaaccctaaccctcgCGGTACCCTCAGCCGGCCCGCCCGCCCGGG
TCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGTGCAGAGGAc
aacgcagctccgccctcgcggtGCTCtccgggtctgtgctgaggagaacgCAACTCCGCC
GTTGCAAAGGCGcgccgcgccggcgcaggcgcagagaggcgcgccgcgccggcgcaggcg
cagagaggcgcgccgcgccggcgcaggcgcagagaggcgcgccgcgccggcgcaggcgca
gagaggcgcgccgcgccggcgcaggcgcagagaggcgcgccgcgccggcgcaggcgcaga
caCATGCTAGCGCGTCGGGGTGGAGGCgtggcgcaggcgcagagaggcgcgccgcgccgg
cgcaggcgcagagacaCATGCTACCGCGTCCAGGGGTGGAGGCgtggcgcaggcgcagag
aggcgcaccgcgccggcgcaggcgcagagacaCATGCTAGCGCGTCCAGGGGTGGAGGCG
TggcgcaggcgcagagacgcAAGCCTAcgggcgggggttgggggggcgTGTGTTGCAGGA
GCAAAGTCGCACGGCGCCGGGCTGGGGCGGGGGGAGGGTGGCGCCGTGCACGCGCAGAAA
CTCACGTCACGGTGGCGCGGCGCAGAGACGGGTAGAACCTCAGTAATCCGAAAAGCCGGG
ATCGACCGCCCCTTGCTTGCAGCCGGGCACTACAGGACCCGCTTGCTCACGGTGCTGTGC
CAGGGCGCCCCCTGCTGGCGACTAGGGCAACTGCAGGGCTCTCTTGCTTAGAGTGGTGGC
CAGCGCCCCCTGCTGGCGCCGGGGCACTGCAGGGCCCTCTTGCTTACTGTATAGTGGTGG
CACGCCGCCTGCTGGCAGCTAGGGACATTGCAGGGTCCTCTTGCTCAAGGTGTAGTGGCA
GCACGCCCACCTGCTGGCAGCTGGGGACACTGCCGGGCCCTCTTGCTCCAACAGTACTGG
CGGATTATAGGGAAACACCCGGAGCATATGCTGTTTGGTCTCAGTAGACTCCTAAATATG
GGATTCCTgggtttaaaagtaaaaaataaatatgtttaatttgtGAACTGATTACCATCA
GAATTGTACTGTTCTGTATCCCACCAGCAATGTCTAGGAATGCCTGTTTCTCCACAAAGT
GTTtacttttggatttttgccagTCTAACAGGTGAAGCCCTGGAGATTCTTATTAGTGAT
TTGGGCTGGGGCCTGgccatgtgtatttttttaaatttccactgaTGATTTTGCTGCATG
GCCGGTGTTGAGAATGACTGCGCAAATTTGCCGGATTTCCTTTGCTGTTCCTGCATGTAG
TTTAAACGAGATTGCCAGCACCGGGTATCATTCACCATTTTTCTTTTCGTTAACTTGCCG
TCAGCCTTTTCTTTGACCTCTTCTTTCTGTTCATGTGTATTTGCTGTCTCTTAGCCCAGA
CTTCCCGTGTCCTTTCCACCGGGCCTTTGAGAGGTCACAGGGTCTTGATGCTGTGGTCTT
CATCTGCAGGTGTCTGACTTCCAGCAACTGCTGGCCTGTGCCAGGGTGCAAGCTGAGCAC
TGGAGTGGAGTTTTCCTGTGGAGAGGAGCCATGCCTAGAGTGGGATGGGCCATTGTTCAT

s

[[SeqRecord(seq=Seq('ATGGCGGGACACCCGAAAGAGAGGGTGGTCACAGATGAGGTCCATCAGAACCAG...TAG'), id='retro_hsap_1', name='retro_hsap_1', description='retro_hsap_1', dbxrefs=[]), SeqRecord(seq=Seq('ATGGTCAACGTACCTAAAACCCGAAGAACCTTCTGTAAGAAGTGTGGCAAGCAT...TAA'), id='retro_hsap_2', name='retro_hsap_2', description='retro_hsap_2', dbxrefs=[]), SeqRecord(seq=Seq('ATGTCCACAATGGGAAACGAGGCCAGTTACCCGGCGGAGATGTGCTCCCACTTT...TGA'), id='retro_hsap_3', name='retro_hsap_3', description='retro_hsap_3', dbxrefs=[])]]

Solution

  • Your current code has some issues:

    So that would need to be fixed first before getting to the question of the positions where the insertions happen.

    I would tackle these issues as follows. I have placed comments where I changed the code

    import random
    
    # removed the insert function: will not be used
    
    def get_retro_text(genome, all_strings):
        string_of_choice = [string for string in all_strings if 400 < len(string) < 500]
        hundred_strings = random.sample(string_of_choice, k=100)
        # get a sorted list of randomly selected insertion points in the genome    
        indices = sorted(random.randrange(len(genome)) for _ in hundred_strings)
        # get a list sub strings that result from slicing up the genome at the random insertion points   
        slices = [genome[i:j] for i, j in zip([0] + indices, indices + [len(genome)])]
        
        # determine what the offsets will be once the selected strings are inserted into the genome   
        start = 0
        offsets = [(start := start + len(infix), start := start + len(string)) for infix, string in zip(slices, hundred_strings)]
        # finally build the new genome by alternating the slices (substrings) with the strings to insert
        new_genome = "".join("".join(pairs) for pairs in zip(slices, hundred_strings)) + slices[-1]
        # return both the new genome and the information of where insertions were made
        return new_genome, offsets
    
    
    # if you have items with a seq attribute, then extract that first:
    s = [str(item.seq) for item in s]
    # get both the genome and the information about the insertion points
    big_genome, offsets = get_retro_text(body, s)
    
    print(big_genome)
    print(offsets)