pythonrsequencefasta

How to filter "productive" amino acid sequences


I have a fasta file with different amino acid sequences, for

example : example.fasta :

>abc
HSTSDSAQTMFPVALLLLAAGSCVKGEQLTQPTSVTVQPGQRLTITCQVSYSLGTYFTAW
IRQPAGKGLEWIGMRSTGASYYKDSLKNKFSIDLDTSSKTVTLNGQNVQPEDTAVYYCAR
APSRGFDYWGKGTMVTITSATPKGPTVFPL

>def
TARQIQHKPCFL*LCCCWQLDHV*RVNS*HSRPL*LCSQVNV*PSPVRSLILLVPTSQLG
SDSLQEKDWSGLE*DLLELHTTKIH*RTSSVST*TLPAKL*L*MDRMCSLKTLLCITVPE
RPVGVLTTGGKAPWSPSPRPPQRDQLCFL*

>ghi
GSQHVRFSTNHVSCSSAAVGSWIMCEG*TVDTADLCDCAARSTSDHHLSGLLFSW*LLHS
LDQTACRKRTGVDWEQIYWSCILQRFIKEQVQYRLRHFQQNCDSKWTECAA*RHCCVLLC
QTTGSGSWLLGERHHGHHHLGHPKGTNCVSS

and I want to filter out the sequences that are "productive" from the "non-productive" ones.

Additional info: I had translated every DNA sequence to amino acid sequence in all 6 frames.

By "non-productive" I mean those that don't translate into proteins (don't have the amino acid M and/or have too many stop codons). I would like to filter out these non-productive sequences in a fasta file.

As for the "productive" ones, I would also like to save every "productive" sequence only with the complete frame in another fasta file.


Solution

  • An example using biopython and a threshold on the number of stop codons.

    # pip install biopython
    from Bio import SeqIO
    
    seqs = SeqIO.parse(open('example.fasta'), 'fasta')
    productive = {}
    for s in seqs:
        productive.setdefault(s.count('*')<3, []).append(s)
    
    print(productive)
    

    Output:

    {True:  [SeqRecord(seq=Seq('HSTSDSAQTMFPVALLLLAAGSCVKGEQLTQPTSVTVQPGQRLTITCQVSYSLG...FPL'), id='abc', name='abc', description='abc', dbxrefs=[])],
     False: [SeqRecord(seq=Seq('TARQIQHKPCFL*LCCCWQLDHV*RVNS*HSRPL*LCSQVNV*PSPVRSLILLV...FL*'), id='def', name='def', description='def', dbxrefs=[]),
             SeqRecord(seq=Seq('GSQHVRFSTNHVSCSSAAVGSWIMCEG*TVDTADLCDCAARSTSDHHLSGLLFS...VSS'), id='ghi', name='ghi', description='ghi', dbxrefs=[])]}
    

    You can add a mode complex logic by replacing s.count('*')<3 by a custom function:

    from Bio import SeqIO
    
    seqs = SeqIO.parse(open('test/stackoverflow/example.fasta'), 'fasta')
    
    def is_productive(s) -> bool:
        # does the sequence start with M and contain less than 3 stops?
        return s.seq.startswith('M') and (s.count('*')<3)
    
    productive = {}
    for s in seqs:
        productive.setdefault(is_productive(s), []).append(s)
    

    writing as fasta:

    with open('productive_seqs.fasta', 'w') as fw:
        SeqIO.write(productive.get(True, []), fw, 'fasta')
    with open('nonproductive_seqs.fasta', 'w') as fw:
        SeqIO.write(productive.get(False, []), fw, 'fasta')
    

    Output:

    # productive_seqs.fasta
    >abc
    HSTSDSAQTMFPVALLLLAAGSCVKGEQLTQPTSVTVQPGQRLTITCQVSYSLGTYFTAW
    IRQPAGKGLEWIGMRSTGASYYKDSLKNKFSIDLDTSSKTVTLNGQNVQPEDTAVYYCAR
    APSRGFDYWGKGTMVTITSATPKGPTVFPL
    
    # nonproductive_seqs.fasta
    >def
    TARQIQHKPCFL*LCCCWQLDHV*RVNS*HSRPL*LCSQVNV*PSPVRSLILLVPTSQLG
    SDSLQEKDWSGLE*DLLELHTTKIH*RTSSVST*TLPAKL*L*MDRMCSLKTLLCITVPE
    RPVGVLTTGGKAPWSPSPRPPQRDQLCFL*
    >ghi
    GSQHVRFSTNHVSCSSAAVGSWIMCEG*TVDTADLCDCAARSTSDHHLSGLLFSW*LLHS
    LDQTACRKRTGVDWEQIYWSCILQRFIKEQVQYRLRHFQQNCDSKWTECAA*RHCCVLLC
    QTTGSGSWLLGERHHGHHHLGHPKGTNCVSS
    

    Note that if you only need the files, you can directly write the sequence in the loop:

    from Bio import SeqIO
    
    seqs = SeqIO.parse(open('test/stackoverflow/example.fasta'), 'fasta')
    
    def is_productive(s):
        return s.seq.startswith('M') and (s.count('*')<3)
    
    with open('productive_seqs.fasta', 'w') as fw1, open('nonproductive_seqs.fasta', 'w') as fw2:
        for s in seqs:
            SeqIO.write(s, fw1 if is_productive(s) else fw2, 'fasta')