pythonnumpyscipyintegrationarea

Area under the curve for a dataset and 2 sets of baseline


Do not know if this problem is posted somewhere else, please direct me to it if that is the case, thanks

I have some data that looks like this enter image description here

I want to calculate the area under the curve, however as the initial and end point do not go to zero I would like to make a linear regression based on the start and end data and then calculate the area under the data but confined within the linear regressions in something like this

enter image description here

I want to choose the the data for the linear regression and then calculate the area under the data based on the last point in the regression for the pre-baseline and the first point in the regression for the post-baseline that is confined within the linear regression.

I have played a bit with chatGPT to provide me some code that than do what I want but it does not seem to understand that I only want the area between the linear regressions and data

def baseline_correction(data, indices):
    # Calculate a unified baseline using linear regression on selected indices
    slope, intercept, _, _, _ = linregress(data[indices, 0], data[indices, 1])
    return slope, intercept

def calculate_area(data, start_index, end_index, slope, intercept):
    # Correct the current by subtracting the baseline
    corrected_current = data[start_index:end_index, 1] - (data[start_index:end_index, 0] * slope + intercept)
    # Calculate the area (charge) under the corrected current curve
    charge = trapz(corrected_current, x=data[start_index:end_index, 0])
    return charge, corrected_current

def process_cycles(data, cycle_starts):
    charges = []
    for i in range(len(cycle_starts) - 1):
        start_index = cycle_starts[i]
        end_index = cycle_starts[i + 1]

        # Selecting only the increasing potential part of the cycle
        increasing_potential_indices = np.where(np.diff(data[start_index:end_index, 0]) > 0)[0] + start_index
        
        # Combine pre-peak and post-peak data points for baseline
        pre_peak_indices = increasing_potential_indices[np.where((data[increasing_potential_indices, 0] >= 0.35) & (data[increasing_potential_indices, 0] <= 0.4))]
        post_peak_indices = increasing_potential_indices[np.where((data[increasing_potential_indices, 0] >= 0.525) & (data[increasing_potential_indices, 0] <= 0.55))]
        combined_indices = np.concatenate((pre_peak_indices, post_peak_indices))
        
        # Calculate unified baseline
        slope, intercept = baseline_correction(data, combined_indices)
        
        # Define peak region (between 0.35V to 0.4V)
        peak_start_idx = np.min(np.where(data[:, 0] >= 0.4)[0])
        peak_end_idx = np.max(np.where(data[:, 0] <= 0.525)[0])
        
        # Calculate the charge
        charge, corrected_current = calculate_area(data, peak_start_idx, peak_end_idx, slope, intercept)
        charges.append(charge)
        
        # Visualization
        plt.figure(figsize=(10, 6))
        plt.plot(data[start_index:end_index, 0], data[start_index:end_index, 1], label='Original Data', color='gray')
        plt.plot(data[combined_indices, 0], data[combined_indices, 1], 'o', label='Baseline Points', color='red')
        baseline_curve = data[peak_start_idx:peak_end_idx, 0] * slope + intercept
        plt.plot(data[peak_start_idx:peak_end_idx, 0], baseline_curve, label='Baseline', color='green')
        plt.plot(data[peak_start_idx:peak_end_idx, 0], corrected_current, label='Corrected Current', color='purple')
        plt.fill_between(data[peak_start_idx:peak_end_idx, 0], 0, corrected_current, color='purple', alpha=0.3, label='Area Under Curve')
        plt.xlabel('Potential (V)')
        plt.ylabel('Current (A)')
        plt.title(f'Cycle {i+1} Charge Calculation')
        plt.legend()
        plt.grid(True)
        plt.show()
    
    return charges

Edit:

data available from Wetransfer:

https://wetransfer.com/downloads/18be5ac96b80858ee5e0f24efab826b120240503140809/d7ba45

the following code is used for data processing:

import numpy as np
import matplotlib.pyplot as plt

def load_numeric_data(filepath):
    data_start_marker = "Potential/V, Current/A"
    numeric_data = []
    
    with open(filepath, 'r') as file:
        # Read through the file until the marker is found
        for line in file:
            if data_start_marker in line:
                break
        
        # Read the remaining lines and process them
        for line in file:
            parts = line.strip().split(',')
            if len(parts) == 2:
                try:
                    potential = float(parts[0].strip())
                    current = float(parts[1].strip())
                    numeric_data.append([potential, current])
                except ValueError:
                    # This handles lines that cannot be converted to floats
                    continue

    # Convert list to a NumPy array
    return np.array(numeric_data)

def find_cycle_starts(data, start_potential):
    # Finding all indices where the potential approximately matches the start_potential
    potential_close_indices = np.where(np.isclose(data[:,0], start_potential, atol=0.01))[0]
    
    # Determine the actual start of each cycle by checking the difference between consecutive matches
    cycle_starts = [potential_close_indices[0]]
    for i in range(1, len(potential_close_indices)):
        if potential_close_indices[i] - potential_close_indices[i-1] > 10:  # Ensuring a significant index gap
            cycle_starts.append(potential_close_indices[i])
    return cycle_starts

def plot_cycle_data(data, cycle_starts, cycle_number):
    if cycle_number > len(cycle_starts):
        print("Cycle number exceeds the available cycles.")
        return
    
    start_index = cycle_starts[cycle_number - 1]
    end_index = cycle_starts[cycle_number] if cycle_number < len(cycle_starts) else len(data)
    
    plt.figure(figsize=(10, 5))
    plt.plot(data[start_index:end_index, 0], data[start_index:end_index, 1], label=f'Cycle {cycle_number}')
    plt.xlabel('Potential (V)')
    plt.ylabel('Current (A)')
    plt.title(f'Cycle {cycle_number} from Potential {data[start_index, 0]}V to {data[end_index-1, 0]}V')
    plt.legend()
    plt.grid(True)
    plt.show()

# Usage
file_path = 'path_to_your_file.txt'
data = load_numeric_data(file_path)
cycle_starts = find_cycle_starts(data, -1.3)
plot_cycle_data(data, cycle_starts, 1)  # Plot the first cycle
charges = process_cycles(data, cycle_starts)
print(charges)

Solution

  • The code below finds the area under the curve for the forward sweep.

    Each cycle is first separated into a forward voltage sweep and a reverse sweep. You can then supply the forward cycle data to calculate_fwd_cycle_peak_area(). This function locates the peak by looking at features of the signal's gradient. The function also renders a plot, showing where the various 'landmarks' are that are used to delineate the peak boundaries. The area is computed using numpy's trapezoidal integration, and the result is returned.

    The function is configured with various defaults that I've found to work well on the data you supplied (all the forward cycles in samples 1, 2, and 3).

    An example result: enter image description here

    Usage:

    #Load data
    data = load_numeric_data(f'/mnt/c/dev/CV_sample1.txt')
    
    #Define the start potentials of the forward and reverse sweeps
    fwd_sweep_start_potential = -1.3
    rev_sweep_start_potential = 0.7
    
    #Find the start indices
    fwd_cycle_starts = find_cycle_starts(data, fwd_sweep_start_potential)
    rev_cycle_starts = find_cycle_starts(data, rev_sweep_start_potential)
    
    #Get the cycles into a list
    fwd_cycles_list, rev_cycles_list = split_on_cycles(data, fwd_cycle_starts, rev_cycle_starts)
    
    #Supply each cycle in turn to the area function
    for i, cycle in enumerate(fwd_cycles_list):
        area = calculate_fwd_cycle_peak_area(cycle)
        print('Peak area of cycle', i, 'is:', area)
    

    The full code is below.

    import numpy as np
    from matplotlib import pyplot as plt
    import pandas as pd
    
    def load_numeric_data(filepath):
        data_start_marker = "Potential/V, Current/A"
        numeric_data = []
        
        with open(filepath, 'r') as file:
            # Read through the file until the marker is found
            for line in file:
                if data_start_marker in line:
                    break
            
            # Read the remaining lines and process them
            for line in file:
                parts = line.strip().split(',')
                if len(parts) == 2:
                    try:
                        potential = float(parts[0].strip())
                        current = float(parts[1].strip())
                        numeric_data.append([potential, current])
                    except ValueError:
                        # This handles lines that cannot be converted to floats
                        continue
    
        # Convert list to a NumPy array
        return np.array(numeric_data)
    
    def find_cycle_starts(data, start_potential):
        # Finding all indices where the potential approximately matches the start_potential
        potential_close_indices = np.where(np.isclose(data[:,0], start_potential, atol=0.01))[0]
        
        # Determine the actual start of each cycle by checking the difference between consecutive matches
        cycle_starts = [potential_close_indices[0]]
        for i in range(1, len(potential_close_indices)):
            if potential_close_indices[i] - potential_close_indices[i-1] > 10:  # Ensuring a significant index gap
                cycle_starts.append(potential_close_indices[i])
        return cycle_starts
    
    def split_on_cycles(data, fwd_cycle_starts, rev_cycle_starts):
        """Returns fwd cycles and rev cycles in two lists
        """
        fwd_cycles_list = []
        rev_cycles_list = []
        
        #fwd cycles
        for i, start_pt in enumerate(fwd_cycle_starts[:-1]): #last one is half cycle
            cycle_period = slice(start_pt, rev_cycle_starts[i])
            fwd_cycles_list.append(data[cycle_period, :])
        
        #rev cycles
        for i, start_pt in enumerate(rev_cycle_starts):
            cycle_period = slice(start_pt, fwd_cycle_starts[i + 1])
            rev_cycles_list.append(data[cycle_period, :])
        return fwd_cycles_list, rev_cycles_list
    
    
    def calculate_fwd_cycle_peak_area(
        cycle,
        clip_below_v=0.2,
        nonzero_grad_thresh=1e-6,
        grad_rise_thresh=10e-6
    ):
        """Locates peak and returns the area. Also renders a plot.
        
        Parameters
        ----------
        cycle: Array (n_samples, 2) of [voltages, currents]
        clip_below: Ignore voltages below this value
        nonzero_grad_thresh: Threshold for clipping the flat part before the peak
        grad_rise_thresh: Used to establish the start of the peak
        
        Returns:
        area under the curve
        """
        voltage, current = [cycle[:, i] for i in range(2)]
    
        #Gradients
        current_dx = pd.Series(np.gradient(current)).rolling(window=10).mean().values
        abs_current_dx = np.abs(current_dx)
        # current_d2x = pd.Series(np.gradient(np.gradient(current))).rolling(window=10).mean().values
    
        f, ax = plt.subplots(figsize=(11, 4.5))
        ax.plot(voltage, current, color='black', linewidth=6, label='data')
        ax.set(xlabel='voltage', ylabel='current')
    
        grad_clr = 'navy'
        ax2 = ax.twinx()
        ax2.plot(voltage, current_dx, color=grad_clr, linewidth=1.5, label='gradient')
        ax2.axhline(0, color=grad_clr, linewidth=1, linestyle=':')
        ax2.set_ylabel('gradient')
        ax2.tick_params(axis='y', colors=grad_clr)
        ax2.yaxis.label.set_color(grad_clr)
        ax2.spines.right.set_color(grad_clr)
    
        cmap = plt.get_cmap('tab10')
        line_fmt = dict(linestyle='--', linewidth=2, alpha=1)
    
        #"rel_start" tracks the starting point for each subsequent operation
        ax.axvline(voltage[rel_start := 0], color=cmap(0), **line_fmt, label='cycle starts')
    
        #Hard clip. Ignore voltages below clip_below_v
        clip_idx = np.argwhere(voltage > clip_below_v)[0].item()
        ax.axvline(voltage[rel_start + clip_idx], color=cmap(1), **line_fmt, label='hard clip ends')
        rel_start += clip_idx
    
        #Clip the flat part
        flatpart_endidx = np.argwhere(abs_current_dx[rel_start:] > nonzero_grad_thresh)[0].item()
        ax.axvline(voltage[rel_start + flatpart_endidx], color=cmap(2), **line_fmt, label='flat part ends')
        rel_start += flatpart_endidx
    
        #Peak starts where the gradient begins to rise
        peak_start_idx = np.argwhere(abs_current_dx[rel_start:] > grad_rise_thresh)[0].item()
        ax.axvline(voltage[rel_start + peak_start_idx], color=cmap(3), **line_fmt, label='peak_start_idx')
        peak_x_indices = [rel_start + peak_start_idx]
        rel_start += peak_start_idx
    
        #Find the midpoint of the downwards slope
        down_slope_idx = np.argmin(current_dx[rel_start:]).item()
        ax.axvline(voltage[rel_start + down_slope_idx], color=cmap(5), **line_fmt, label='down_slope_idx')
        rel_start += down_slope_idx
    
        #The gradient point closest to 0 after the downwards slope is where the peak ends 
        peak_end_idx = np.argmin(abs_current_dx[rel_start:]).item()
        peak_x_indices.append(rel_start + peak_end_idx)
        ax.axvline(voltage[rel_start + peak_end_idx], color=cmap(6), **line_fmt, label='peak_end_idx')
    
        f.legend(
            fontsize=8.5, loc='upper left', ncols=10, shadow=True, framealpha=1,
            bbox_to_anchor=(0.05, 0)
        )
        ax.set_xlim(voltage[clip_idx + flatpart_endidx - 50], voltage.max() * 0.9)
        ax.set_ylim(-0.005, 0.0075)
        ax2.set_ylim(ax2.get_ylim()[0] * 0.83, ax2.get_ylim()[1] * 0.8)
    
        #Points denoting line
        ax.scatter(
            voltage[peak_x_indices], current[peak_x_indices],
            marker='o', color='lime', s=40, zorder=3,
        )
    
        peak_indices_all = np.arange(*peak_x_indices)
        peak_voltages = voltage[peak_indices_all]
        peak_currents = current[peak_indices_all]
    
        line_voltages = peak_voltages[[0, -1]]
        line_currents = peak_currents[[0, -1]]
        line_poly = np.polynomial.Polynomial.fit(line_voltages, line_currents, deg=1)
        ax.plot(line_voltages, line_poly(line_voltages), linewidth=3, color='lime')
        ax.plot(peak_voltages, peak_currents, color='lime', linewidth=1.8)
        ax.fill_between(
            x=peak_voltages, y1=line_poly(peak_voltages), y2=peak_currents,
            hatch=None, facecolor='lime', alpha=0.25
        )
    
        peak_area = np.trapz(peak_currents, peak_voltages)
        triangle_area = 0.5 * np.ptp(line_voltages) * line_currents.max()
        peak_area -= triangle_area #subtract the triangle under the line
        ax.set_title(f'shaded area: {peak_area:>.2e}', weight='bold')
        plt.show()
        
        return peak_area
    
    #Load data
    data = load_numeric_data(f'/mnt/c/dev/CV_sample1.txt')
    
    #Define the start potentials of the forward and reverse sweeps
    fwd_sweep_start_potential = -1.3
    rev_sweep_start_potential = 0.7
    
    #Find the start indices
    fwd_cycle_starts = find_cycle_starts(data, fwd_sweep_start_potential)
    rev_cycle_starts = find_cycle_starts(data, rev_sweep_start_potential)
    
    #Get the cycles into a list
    fwd_cycles_list, rev_cycles_list = split_on_cycles(data, fwd_cycle_starts, rev_cycle_starts)
    
    for i, cycle in enumerate(fwd_cycles_list):
        area = calculate_fwd_cycle_peak_area(cycle)
        print('Peak area of cycle', i, 'is:', area)
    
    #For debugging: view the data splitting results
    if False:
        plt.plot(data[:, 0])
        [plt.gca().axvline(x, linewidth=1, linestyle=':', color='tab:red') for x in fwd_cycle_starts]
        [plt.gca().axvline(x, linewidth=1, linestyle=':', color='black') for x in rev_cycle_starts]
        plt.gcf().set_size_inches(8, 2)
        plt.gca().set(
            xlabel='index', ylabel='V',
            title=f'{len(fwd_cycles_list)} fwd cycles/{len(rev_cycles_list)} rev cycles'
        )
        plt.show()
    
        for cycles_list in [fwd_cycles_list, rev_cycles_list]:
            #View split cycles
            for c in cycles_list:
                plt.plot(c[:, 0], c[:, 1])
            plt.gca().set(xlim=(0.25, 0.65), ylim=(-0.0075, 0.0075))
            plt.gca().set(xlabel='V', ylabel='I')
            plt.gcf().set_size_inches(8, 3)
            plt.show()