python pandas user-interface tkinter plotly

Interactive Plotly plots with file reader not reading correctly through button function in Python (VS Code/Jupyter)

I have a button function that uses filedialog.askopenfilename to read data from xls, xlsx, dpt, xy, and txt file types into a pandas dataframe. When I use the button and open anything other than xls or xlsx files, I get an error that the file is empty.

Here are the imports:

# GUI Imports
import tkinter as tk
from tkinter import ttk
from tkinter import filedialog
from tkinter import messagebox as mb
# Math / Plotting Imports
import numpy as np
import pandas as pd
from scipy.signal import find_peaks as fp
from scipy.signal import peak_widths as pw
import matplotlib.pyplot as plt
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
from BaselineRemoval import BaselineRemoval as BR
# Fancy plotting imports
import plotly.graph_objects as go
from plotly.subplots import make_subplots as sub
import plotly.express as px
import plotly.offline as py
import plotly.io as pio
# Other
import webbrowser
import os
import chardet
pd.options.plotting.backend='plotly'

Below is the function block:

def min_max_normalize(array):
  """Normalizes a NumPy array using min-max scaling.

  Args:
    array: A NumPy array.

  Returns:
    A normalized NumPy array.
  """
  min_val = np.min(array)
  max_val = np.max(array)
  normalized_array = (array - min_val) / (max_val - min_val)
  return normalized_array

def find_index(array, value):
    """Finds the index of the nearest value using NumPy."""
    array = np.asarray(array)
    idx = np.argmin(np.abs(array - value))
    return idx

def open_plot():
    #Actual plotting
    Upper_index = find_index(data.iloc[:,0],float(Upper_x))
    Lower_index = find_index(data.iloc[:,0],float(Lower_x))

    x = data.iloc[:,0].values
    intensity = data.iloc[:,1].values
    bg_data = BR(intensity).ZhangFit() # Removes background
    Norm_bg_data = min_max_normalize(bg_data)

    peaks,properties = fp(bg_data, prominence=20) # Detects peaks at indices
    widths, width_heights, left_ips,right_ips = pw(bg_data, peaks, rel_height=0.5) # Calculates FWHM
    filtered_peaks = peaks[(peaks>=Lower_index) & (peaks<=Upper_index)] # Removes peaks outside of window of interest

    fig = sub(rows=1,cols=3, subplot_titles=('Raw Data','Background Removed Data','Normalized Data'))
    fig.data = []
    fig.add_trace(go.Scatter(x=x[Lower_index:Upper_index],y=intensity[Lower_index:Upper_index], mode='lines'), row=1,col=1)
    fig.add_trace(go.Scatter(x=x[Lower_index:Upper_index],y=bg_data[Lower_index:Upper_index], mode='lines'), row=1,col=2)
    fig.add_trace(go.Scatter(x=x[filtered_peaks],y=bg_data[filtered_peaks], mode='markers', marker=dict(size=4,color='black',symbol='cross', line=dict(width=0.2))), row=1,col=2)
    fig.add_trace(go.Scatter(x=x[Lower_index:Upper_index],y=Norm_bg_data[Lower_index:Upper_index], mode='lines'), row=1,col=3)
    fig.update_layout(showlegend=False,autosize=True, plot_bgcolor='white', margin=dict(l=10,r=10,b=20,t=30))
    fig.update_xaxes(showline=True, linecolor='black', linewidth=2.4, ticks='outside', tickcolor='black')
    fig.update_yaxes(title_text='Intensity (a.u.)',showline=True, linecolor='black', linewidth=2.4, ticks='outside', tickcolor='black')
    # Finding Maximum peak posiiton
    peak_idx = find_index(bg_data[Lower_index:Upper_index],max(bg_data[Lower_index:Upper_index]))+Lower_index
    peak_pos = x[find_index(bg_data[Lower_index:Upper_index],max(bg_data[Lower_index:Upper_index]))+Lower_index]
    # Calculating FWHM
    Left_FWHM = x[int(left_ips[find_index(peaks,peak_idx)])]
    Right_FWHM = x[int(right_ips[find_index(peaks,peak_idx)])]
    FWHM = round(Right_FWHM-Left_FWHM,1)
    # Displaying Peak Details
    fig.add_annotation(x=peak_pos, y=max(intensity[Lower_index:Upper_index]), xref='x1', yref='y1', text='Peak: ' + str(peak_pos) + '<br>FWHM: ' + str(FWHM), arrowhead=1, yshift=5)
    fig.add_annotation(x=peak_pos, y=max(bg_data[Lower_index:Upper_index]), xref='x2', yref='y2', text='Peak: ' + str(peak_pos) + '<br>FWHM: ' + str(FWHM), arrowhead=1, yshift=5)
    fig.add_annotation(x=peak_pos, y=max(Norm_bg_data[Lower_index:Upper_index]), xref='x3', yref='y3', text='Peak: ' + str(peak_pos) + '<br>FWHM: ' + str(FWHM), arrowhead=1, yshift=5)
    pio.write_html(fig, 'plot.html')
    webbrowser.open('plot.html')
    if len(peaks)==0:
        output_label3.set('No peaks found, but here are your plots.')
        fig.add_trace(go.Scatter(x=x[Lower_index:Upper_index],y=intensity[Lower_index:Upper_index], mode='lines'), row=1,col=1)
        fig.add_trace(go.Scatter(x=x[Lower_index:Upper_index],y=bg_data[Lower_index:Upper_index], mode='lines'), row=1,col=2)
        fig.add_trace(go.Scatter(x=x[Lower_index:Upper_index],y=Norm_bg_data[Lower_index:Upper_index], mode='lines'), row=1,col=3)
        pio.write_html(fig, 'plot.html')
        webbrowser.open('plot.html')


def open_file():
    file = filedialog.askopenfilename(filetypes=[("DPT files", "*.dpt"), ("Text files", "*.txt"), ("Excel files", "*.xlsx *.xls"),("XY files", "*.xy")])
    if file:
        with open(file, 'rb') as file:
            result = chardet.detect(file.read())
            global data
            try:
                data = pd.read_csv(file, header='infer', sep='[ \t]',engine='python')
            except:
                data = pd.read_excel(file)
            output_string.set('File selected is: ' + str(file.name))
            file_section.pack()

    else:
        output_string.set('No File Selected, please choose a file.')

def get_bounds():
    global Lower_x
    global Upper_x
    Lower_x = entry1.get()
    Upper_x = entry2.get()
    output_string2.set('File chosen. Enjoy the graphs!')

Then I run the cell below to open the tkinter GUI:

# Open File Window

base = tk.Tk()
base.title('Analysis Tool')
base.geometry('1000x500')

# Hacking into the main frame
main_frame = tk.Frame(base)
main_frame.pack(pady=10)
title_label = ttk.Label(master = base, text = 'Select your data', font = 'Comic_sans 24 bold')
title_label.pack(pady=10)
open_file_button = ttk.Button(base, text = 'Choose file',command=open_file)
open_file_button.pack(pady=10)

output_string = tk.StringVar()
output_label = ttk.Label(
    base, 
    text = 'Choose file...', 
    textvariable= output_string)
output_label.pack(pady=5)

# Additional section
file_section = ttk.Frame(base)
entry1_float = tk.DoubleVar()
entry2_float = tk.DoubleVar()
advice_label = ttk.Label(file_section, text = 'Leave the fields as is for the full spectrum.')
advice_label.pack()
entry1_label = ttk.Label(file_section, text = 'Please type your desired starting x value')
entry1 = ttk.Entry(file_section, textvariable= entry1_float)
entry2_label = ttk.Label(file_section, text = 'Please type your desired ending x value')
entry2 = ttk.Entry(file_section, textvariable= entry2_float)
entry1_label.pack()
entry1.pack()
entry2_label.pack()
entry2.insert(0, '1000')
entry2.pack()
submit_bounds_button = ttk.Button(file_section, text='Submit Bounds', command=get_bounds)
submit_bounds_button.pack(pady=10)

output_string2 = tk.StringVar()
output_label2 = ttk.Label(
    file_section,
    text = '',
    textvariable=output_string2
)
output_label2.pack(pady=5)

open_plots_in_new_window = ttk.Button(file_section, text = 'Open in browser', command=open_plot)
open_plots_in_new_window.pack(pady=10)

output_string3 = tk.StringVar()
output_label3 = ttk.Label(
    file_section,
    text='',
    textvariable=output_string3
)
output_label3.pack(pady=5)
# base.after(60000, lambda: base.destroy())

warning_label = ttk.Label(file_section, text = 'CLOSE THIS WINDOW BEFORE STOPPING CODE!', font='Chiller 24 italic')
warning_label.pack(pady=5)
base.mainloop()

Then I get this error:

pandas.errors.EmptyDataError: No columns to parse from file

I made a separate block to check the pandas.read_csv and pandas.read_excel tools themselves which works as intended. So something is different about including it in the function stored in the button causing it to read any other file type as blank.

Upper_x = 10000
Lower_x = 0

file = filedialog.askopenfilename(filetypes=[("DPT files", "*.dpt"), ("Text files", "*.txt"), ("Excel files", "*.xlsx *.xls"),("XY files", "*.xy")])
try:
    data = pd.read_csv(file, header='infer', sep='[ \t]',engine='python')
except:
    data = pd.read_excel(file)
#Actual plotting
Upper_index = find_index(data.iloc[:,0],float(Upper_x))
Lower_index = find_index(data.iloc[:,0],float(Lower_x))

x = data.iloc[:,0].values
intensity = data.iloc[:,1].values
bg_data = BR(intensity).ZhangFit() # Removes background
Norm_bg_data = min_max_normalize(bg_data)

peaks,properties = fp(bg_data, prominence=20) # Detects peaks and returns indices
widths, width_heights, left_ips,right_ips = pw(bg_data, peaks, rel_height=0.5) # Calculates FWHM
filtered_peaks = peaks[(peaks>=Lower_index) & (peaks<=Upper_index)] # Removes peaks outside of window of interest

fig = sub(rows=1,cols=3, subplot_titles=('Raw Data','Background Removed Data','Normalized Data'))
fig.add_trace(go.Scatter(x=x[Lower_index:Upper_index],y=intensity[Lower_index:Upper_index], mode='lines'), row=1,col=1)
fig.add_trace(go.Scatter(x=x[Lower_index:Upper_index],y=bg_data[Lower_index:Upper_index], mode='lines'), row=1,col=2)
fig.add_trace(go.Scatter(x=x[filtered_peaks],y=bg_data[filtered_peaks], mode='markers', marker=dict(size=4,color='black',symbol='cross', line=dict(width=0.2))), row=1,col=2)
fig.add_trace(go.Scatter(x=x[Lower_index:Upper_index],y=Norm_bg_data[Lower_index:Upper_index], mode='lines'), row=1,col=3)
fig.update_layout(showlegend=False,autosize=True, plot_bgcolor='white', margin=dict(l=10,r=10,b=20,t=30))
fig.update_xaxes(showline=True, linecolor='black', linewidth=2.4, ticks='outside', tickcolor='black')
fig.update_yaxes(title_text='Intensity (a.u.)',showline=True, linecolor='black', linewidth=2.4, ticks='outside', tickcolor='black')
# Finding Maximum peak posiiton
peak_idx = find_index(bg_data[Lower_index:Upper_index],max(bg_data[Lower_index:Upper_index]))+Lower_index
peak_pos = x[find_index(bg_data[Lower_index:Upper_index],max(bg_data[Lower_index:Upper_index]))+Lower_index]
# Calculating FWHM
Left_FWHM = x[int(left_ips[find_index(peaks,peak_idx)])]
Right_FWHM = x[int(right_ips[find_index(peaks,peak_idx)])]
FWHM = round(Right_FWHM-Left_FWHM,1)
# Displaying Peak Details
fig.add_annotation(x=peak_pos, y=max(intensity[Lower_index:Upper_index]), xref='x1', yref='y1', text='Peak: ' + str(peak_pos) + '<br>FWHM: ' + str(FWHM), arrowhead=1, yshift=5)
fig.add_annotation(x=peak_pos, y=max(bg_data[Lower_index:Upper_index]), xref='x2', yref='y2', text='Peak: ' + str(peak_pos) + '<br>FWHM: ' + str(FWHM), arrowhead=1, yshift=5)
fig.add_annotation(x=peak_pos, y=max(Norm_bg_data[Lower_index:Upper_index]), xref='x3', yref='y3', text='Peak: ' + str(peak_pos) + '<br>FWHM: ' + str(FWHM), arrowhead=1, yshift=5)

fig.show()

Any help is appreciated!

Solution

In the two pieces of code you've shown, you have the following code:

data = pd.read_csv(file, header='infer', sep='[ \t]',engine='python')

However, this does something different in each case, because in one example file is a string representing a file, and in the other, it is a file-like object.

The reason why this is relevant is that a file-like object has a read position within a file, and reading from the file can change this position.

So for example this can work:

import pandas as pd


filename = 'filename.csv'
with open(filename, 'rb') as file:
#     file.read()
    print(pd.read_csv(file))

But it will fail if you uncomment file.read(). Interestingly, read_excel() does not seem to care. It seems to automatically set the read position in the file.

If you want to read from the file with file.read(), and also read from it with read_csv(), you need to reset the file position.

You could do this, for example:

import pandas as pd
import chardet


filename = 'filename.csv'
with open(filename, 'rb') as file:
    result = chardet.detect(file.read())
    print("character set:", result)
    file.seek(0)
    print(pd.read_csv(file))

Two more notes I will leave you with:

Partially, this happened because your program re-uses the variable file to store both a filename and a file-like object. If you used different variables for these, it would have been easier for you to notice that your test program is slightly different from your real program.
I think it is bad practice to wrap read_csv() in a try-except, and re-try with excel. I would suggest that you look at the extension instead, and choose which function to call that way.