#After exporting data into date frame it looks like below
data 1 | |
---|---|
id | datum |
2000 | 2024.09.02 |
2903 | 2024.09.02 |
data 2 | |
id | datum |
4000 | 2024.09.02 |
4001 | 2024.09.02 |
#expected answer should look like this
id | datum | data |
---|---|---|
2000 | 2024.09.02 | 1 |
2903 | 2024.09.02 | 1 |
4000 | 2024.09.02 | 2 |
4001 | 2024.09.02 | 2 |
import pandas as pd
import numpy as np
# Step 1: Read the Excel file
# Replace 'your_file_path.xlsx' with the actual path to your Excel file
file_path = 'your_file_path.xlsx'
# Load the Excel file
xls = pd.ExcelFile(file_path)
# Assuming 'Datei2' is the name of the sheet we want to read
df = pd.read_excel(xls, sheet_name='Datei2')
df['Column1 - Copy'] = df['Column1']
split_columns = df['Column1 - Copy'].str.split(' ', expand=True)
split_columns.columns = [f'Column1 - Copy.{i+1}' for i in range(split_columns.shape[1])]
df = pd.concat([df, split_columns], axis=1)
# Step 6: Fill Down Values
df['Column1 - Copy.16'] = df['Column1 - Copy.16'].fillna(method='ffill')
Assuming your excel file looks exactly like you put, you can use cumsum
to put your data into groups and then remove any unnecessary data by filtering the dataframe:
df = pd.read_excel(<your file>, header=None, names=['id', 'datum'])
id datum
0 data 1 NaN
1 id datum
2 2000 2024.09.02
3 2903 2024.09.02
4 data 2 NaN
5 id datum
6 4000 2024.09.02
7 4001 2024.09.02
df['data'] = (df['datum'].isna()).cumsum()
id datum data
0 data 1 NaN 1
1 id datum 1
2 2000 2024.09.02 1
3 2903 2024.09.02 1
4 data 2 NaN 2
5 id datum 2
6 4000 2024.09.02 2
7 4001 2024.09.02 2
df = df[~df['datum'].eq('datum')].dropna(how='any')
id datum data
2 2000 2024.09.02 1
3 2903 2024.09.02 1
6 4000 2024.09.02 2
7 4001 2024.09.02 2