pythonbioinformaticsxlsxwriterbiopython

Writing Python output as xlsx


I want to execute same function (gives output A, B, C, D) for all the files (library) available in the given path. I am trying write the output (A,B,C,D) in four different columns of a sheet in xlsx. Also, the sheet name of xlsx should be same as the respected file available in the path.

I have written the following code:

def create_xlsx_file(xlsx_name, file_path): 
    
    workbook = xlsxwriter.Workbook(xlsx_name) ### creates a xlsx file
    workbook.close()
    libraries=os.listdir(file_path)
    
    file_path=os.chdir(file_path)
    for library in libraries: ### to create the sheets named same as the library 
        # print(library)
        if library.endswith('.txt'):
            # library=file_path+library
            # print(library)
            main(library, xlsx_name) 

def main(library, xlsx_name): ###library = all files in the given path
    directory=os.chdir(os.getcwd())
    workbook = openpyxl.load_workbook(xlsx_name)
    worksheet = workbook.create_sheet(library, 0)##### creates workshhets named same as library name
    #print('library is: - ',library)
    sheet=workbook[library] ###to create column headers
    sheet.cell(column=1, row=1, value='value_A')
    sheet.cell(column=2, row=1, value='value_B')
    sheet.cell(column=3, row=1, value='value_C')
    sheet.cell(column=4, row=1, value='value_D')
    workbook.save(xlsx_name)
    with open(library, 'r') as library:
        for line in library:

            A=line.split(' ')[0]
            B=line.split(' ')[1]
            C=line.split(' ')[2]
            D=line.split(' ')[3]

            sheet=workbook[library]
            sheet.cell(column=1, row=sheet.max_row+1, value=str(A))
            sheet.cell(column=2, row=sheet.max_row, value=str(B))
            sheet.cell(column=3, row=sheet.max_row, value=str(C))
            sheet.cell(column=4, row=sheet.max_row, value=str(D))
            
    print(f'library  {library} has been written at {os.getcwd()}')
    #time.sleep(1)
    workbook.save(xlsx_name)

This code works absolutely fine for me but it is too slow to write xlsx file as my path has hundreds of .txt libraries and each library have more than millions of lines.

I could save the output(A,B,C,D) as .txt format and then can write xlsx file manually but it is very laboursome.

Is there any way to fasten this process? or any other fast xlsx writer is available? Any help will be appreciated. Thanks


Solution

  • I have found a faster way to save my data into excel is : Since I have output as a result of a for loop, first save the output(A,B,C,D) into a dictionary and then save into excel using pandas.

    def create_xlsx_file(xlsx_name, file_path):
    
    workbook = xlsxwriter.Workbook(xlsx_name) ### creates a xlsx file
    workbook.close()
    libraries=os.listdir(file_path)
    
    file_path=os.chdir(file_path)
    for library in libraries: ### to create the sheets named same as the library 
        # print(library)
        if library.endswith('.txt'):
            # library=file_path+library
            # print(library)
            main(library, xlsx_name) 
    
    def main(library, xlsx_name): ###library = all files in the given path
        dic={'label_A':[], 'label_B':[],'label_C':[],'label_D':[]}# to store A,B,C,D values.
        directory=os.chdir(os.getcwd())
        workbook = openpyxl.load_workbook(xlsx_name)
        worksheet = workbook.create_sheet(library, 0)##### creates workshhets named same as library name
        #print('library is: - ',library)
        sheet=workbook[library] ###to create column headers
        sheet.cell(column=1, row=1, value='value_A')
        sheet.cell(column=2, row=1, value='value_B')
        sheet.cell(column=3, row=1, value='value_C')
        sheet.cell(column=4, row=1, value='value_D')
        workbook.save(xlsx_name)
        with open(library, 'r') as library:
           for line in library:
    
            A=line.split(' ')[0]
            B=line.split(' ')[1]
            C=line.split(' ')[2]
            D=line.split(' ')[3]
    
            dic['label_A'].append(A)
            dic['label_B'].append(B)
            dic['label_C'].append(C)
            dic['label_D'].append(D)
            
        df=pd.DataFrame(data=dic, columns=['label_A', 'label_B', 'label_C', 'label_D'])
        df.to_excel(xlsx_name, sheet_name=library)
        print(f'library  {library} has been written at {os.getcwd()}')
        #time.sleep(1)
        workbook.save(xlsx_name)