python pandas dataframe python-re sepa

Same entry, although only found in one column


from sepa import parser
import re
import csv
import pandas as pd
import numpy as np


# Utility function: drop the first default-namespace declaration from the XML
def strip_namespace(xml):
    """Return *xml* with its first `` xmlns="..."`` attribute removed.

    Only the first non-empty default-namespace declaration is stripped.
    Prefixed declarations (``xmlns:foo="..."``) and any later occurrences
    are left untouched.
    """
    default_ns = re.compile(' xmlns="[^"]+"')
    return default_ns.sub('', xml, 1)


# Read file (the whole XML document is loaded into memory as text)
with open('test.xml', 'r') as f:
    input_data = f.read()

# Parse the bank statement XML to dictionary
# (the first default xmlns declaration is stripped and the text is handed over as UTF-8 bytes)
camt_dict = parser.parse_string(parser.bank_to_customer_statement, bytes(strip_namespace(input_data), 'utf8'))

# Only the index of this frame is used below, to walk camt_dict['statements'].
statements = pd.DataFrame.from_dict(camt_dict['statements'])
# Collects one output DataFrame per statement that has entries.
all_entries = []

for i, _ in statements.iterrows():
    # Statements without an 'entries' key are skipped entirely.
    if 'entries' in camt_dict['statements'][i]:
        # df is the output frame for this statement; dd holds the raw entry records.
        df = pd.DataFrame()
        dd = pd.DataFrame.from_records(camt_dict['statements'][i]['entries'])
        # dg is only used further down to count the entries.
        dg = dd['entry_details']
        # Pull the 'date' key out of each value_date and reformat it as DD-MM-YYYY text.
        df['Date'] = dd['value_date'].str['date']
        df['Date'] = pd.to_datetime(df['Date']).dt.strftime('%d-%m-%Y')
        # The statement's account IBAN is a scalar, so it is repeated on every row.
        iban = camt_dict['statements'][i]['account']['id']['iban']
        df['IBAN'] = iban
        df['Currency'] = dd['amount'].str['currency']

        # Sort Credit/Debit in separate Columns
        # (the amount goes into the column matching the indicator; the other column gets '')
        df['Credit'] = np.where(dd['credit_debit_indicator'] == 'CRDT', dd['amount'].str['_value'], '')
        df['Debit'] = np.where(dd['credit_debit_indicator'] == 'DBIT', dd['amount'].str['_value'], '')

        # Get destination IBAN
        getlength = len(dg.index)  # number of entries (2 in the asker's sample)
        
        # NOTE(review): this inner loop reuses `i`, shadowing the outer statement
        # index. The outer `i` is not read again in this iteration, so it is
        # harmless here, but a distinct name would be clearer.
        for i in range(0, getlength):
            # Stringify the entry details so the regex can scan them.
            result = str(dd['entry_details'][i])
            print(result + "Resultat " + str(i))
            # Matches a CH-prefixed IBAN, either space-grouped
            # (CHdd dddd dddd dddd dddd d) or compact (CH followed by 19 digits).
            search_for_iban = re.search("CH\d{2}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{1}|CH\d{19}", result)
            # NOTE(review): `df['Test'] = <scalar>` assigns the value to the WHOLE
            # column, not to row i. Every iteration overwrites all rows, so only
            # the result of the last entry survives.
            if(search_for_iban is None):
                print('the search is none')
                df['Test'] = 'None'
            else:
                print('the search is a match')
                df['Test'] = 'Yes'

        all_entries.append(df)


# Stack the per-statement frames into one table.
# NOTE(review): pd.concat raises ValueError if all_entries is empty (no statement had entries).
df_entries = pd.concat(all_entries)
print(df_entries)

**My problem is just with this code block:**

# Excerpt of the inner loop from the full script (indentation as pasted).
for i in range(0, getlength):
            # Stringify the i-th entry's details and look for a CH-prefixed IBAN in it.
            result = str(dd['entry_details'][i])
            search_for_iban = re.search("CH\d{2}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{1}|CH\d{19}", result)
            
            # NOTE(review): both branches assign a scalar to the whole 'Test'
            # column, so the value from the last iteration ends up in every row.
            if(search_for_iban is None):
                df['Test'] = 'None'
            else:
                df['Test'] = search_for_iban.group()

        all_entries.append(df)

I have already tried various approaches using the index: the variable `i` counts up correctly, and `getlength` is also correct (2 for the 2 entries).

**What I'm expecting:** if `search_for_iban` (the result of the `re.search` regex lookup) matches an IBAN only for the 2nd entry, I want that IBAN to appear only in the 2nd row of the DataFrame's "Test" column, as follows:

what i expect

**What I'm getting:** the same entry appears in both row 1 and row 2, although no IBAN was found in row 1. What am I overlooking? My head is hurting! :D

what i got

I think I am making a thinking error here about how a normal for loop interacts with pandas entries.


Solution

  • You can try:

    for i in range(0, getlength):
        # Stringify the i-th entry's details so the regex can scan them.
        result = str(dd['entry_details'][i])
        search_for_iban = re.search(r"CH\d{2}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{4}[ ]\d{1}|CH\d{19}", result)

        # Write into row i only. `df['Test'] = value` would assign the value
        # to every row, so the last iteration would overwrite all earlier
        # results. That applies to BOTH branches.
        if search_for_iban is None:
            df.loc[i, 'Test'] = 'None'
        else:
            # .group() returns the matched IBAN text; storing the Match
            # object itself would put a re.Match repr into the cell.
            df.loc[i, 'Test'] = search_for_iban.group()