Tags: python, azure, azure-blob-storage

Python script to join 2 csv files with an existing excel file in Azure blob storage


There is an excel file with 4 worksheets (named tab2, tab3, tab4 and tab6) in blob storage. I have a pipeline that generates 2 csv files. I want to add the 1st csv file into the said excel file with the csv file appearing as the 1st worksheet (tab1). The 2nd csv file should be added into the excel file as the penultimate worksheet (tab5). I have a sample python script:

from azure.storage.blob import BlobServiceClient
import pandas as pd
from io import BytesIO
from datetime import date

today = date.today().strftime("%Y%m%d")

# The EndpointSuffix must be the bare DNS suffix (core.windows.net). Embedding
# the blob host or a path here builds a malformed endpoint URL and causes the
# "Failed to resolve ..." DNS error.
connection_string = "DefaultEndpointsProtocol=https;AccountName=MyAccount;AccountKey=accountkey;EndpointSuffix=core.windows.net"

# The container name is just the container. Folder-style paths ("Prj1/") are
# part of the blob name, not the container name.
container_name = "MyContainer"
prefix = "Prj1/"
excel_blob_name = prefix + "My Excel.xlsx"
csv_blob_1_name = prefix + "My_csv1_" + today + ".csv"
csv_blob_2_name = prefix + "My_csv2_" + today + ".csv"

# Initialize Blob Service Client
blob_service_client = BlobServiceClient.from_connection_string(connection_string)

# Download the existing Excel workbook into memory.
excel_blob_client = blob_service_client.get_blob_client(container=container_name, blob=excel_blob_name)
excel_blob_data = BytesIO(excel_blob_client.download_blob().readall())

# Download both CSV files and parse them into DataFrames.
csv_blob_client_1 = blob_service_client.get_blob_client(container=container_name, blob=csv_blob_1_name)
df_csv_1 = pd.read_csv(BytesIO(csv_blob_client_1.download_blob().readall()))

csv_blob_client_2 = blob_service_client.get_blob_client(container=container_name, blob=csv_blob_2_name)
df_csv_2 = pd.read_csv(BytesIO(csv_blob_client_2.download_blob().readall()))

# Append the CSV data as new worksheets. if_sheet_exists="replace" makes
# reruns idempotent instead of raising on duplicate names (pandas >= 1.3).
with pd.ExcelWriter(excel_blob_data, mode="a", engine="openpyxl", if_sheet_exists="replace") as excel_writer:
    df_csv_1.to_excel(excel_writer, sheet_name="tab1", index=False)
    df_csv_2.to_excel(excel_writer, sheet_name="tab5", index=False)

    # to_excel appends sheets at the END of the workbook; reorder so tab1 is
    # first and tab5 is penultimate. NOTE: _sheets is a private openpyxl
    # attribute -- openpyxl exposes no public bulk-reorder API.
    book = excel_writer.book
    others = [name for name in book.sheetnames if name not in ("tab1", "tab5")]
    if others:
        order = ["tab1"] + others[:-1] + ["tab5", others[-1]]
    else:
        # Degenerate case: no other sheets, so tab5 can only come last.
        order = ["tab1", "tab5"]
    book._sheets = [book[name] for name in order]

# Save modified Excel file back to Blob Storage
excel_blob_data.seek(0)  # Reset stream position before re-uploading
excel_blob_client.upload_blob(excel_blob_data, overwrite=True)

Edit: Adding the full traceback as requested by JonSG.

Traceback (most recent call last):
  File "d:\batch\tasks\workitems\adfv2-ProjCont\job-1\3e150412-3328-4b72-b09b-29494f29396b\wd\csv_join_excel.py", line 23, in <module>
    excel_blob_data = BytesIO(excel_blob_client.download_blob().readall())
                              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\Python312\Lib\site-packages\azure\core\tracing\decorator.py", line 119, in wrapper_use_tracer
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\Python312\Lib\site-packages\azure\storage\blob\_blob_client.py", line 753, in download_blob
    return StorageStreamDownloader(**options)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\Python312\Lib\site-packages\azure\storage\blob\_download.py", line 403, in __init__
    self._response = self._initial_request()
                     ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\Python312\Lib\site-packages\azure\storage\blob\_download.py", line 456, in _initial_request
    location_mode, response = cast(Tuple[Optional[str], Any], self._clients.blob.download(
                                                              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\Python312\Lib\site-packages\azure\core\tracing\decorator.py", line 119, in wrapper_use_tracer
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\Python312\Lib\site-packages\azure\storage\blob\_generated\operations\_blob_operations.py", line 1641, in download
    pipeline_response: PipelineResponse = self._client._pipeline.run(  # pylint: disable=protected-access
                                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\Python312\Lib\site-packages\azure\core\pipeline\_base.py", line 242, in run
    return first_node.send(pipeline_request)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\Python312\Lib\site-packages\azure\core\pipeline\_base.py", line 98, in send
    response = self.next.send(request)
               ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\Python312\Lib\site-packages\azure\core\pipeline\_base.py", line 98, in send
    response = self.next.send(request)
               ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\Python312\Lib\site-packages\azure\core\pipeline\_base.py", line 98, in send
    response = self.next.send(request)
               ^^^^^^^^^^^^^^^^^^^^^^^
  [Previous line repeated 2 more times]
  File "C:\Program Files\Python312\Lib\site-packages\azure\core\pipeline\policies\_redirect.py", line 205, in send
    response = self.next.send(request)
               ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\Python312\Lib\site-packages\azure\core\pipeline\_base.py", line 98, in send
    response = self.next.send(request)
               ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\Python312\Lib\site-packages\azure\storage\blob\_shared\policies.py", line 555, in send
    raise err
  File "C:\Program Files\Python312\Lib\site-packages\azure\storage\blob\_shared\policies.py", line 527, in send
    response = self.next.send(request)
               ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\Python312\Lib\site-packages\azure\core\pipeline\_base.py", line 98, in send
    response = self.next.send(request)
               ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\Python312\Lib\site-packages\azure\core\pipeline\_base.py", line 98, in send
    response = self.next.send(request)
               ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\Python312\Lib\site-packages\azure\core\pipeline\_base.py", line 98, in send
    response = self.next.send(request)
               ^^^^^^^^^^^^^^^^^^^^^^^
  [Previous line repeated 1 more time]
  File "C:\Program Files\Python312\Lib\site-packages\azure\storage\blob\_shared\policies.py", line 301, in send
    response = self.next.send(request)
               ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\Python312\Lib\site-packages\azure\core\pipeline\_base.py", line 98, in send
    response = self.next.send(request)
               ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\Python312\Lib\site-packages\azure\core\pipeline\_base.py", line 98, in send
    response = self.next.send(request)
               ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\Python312\Lib\site-packages\azure\core\pipeline\_base.py", line 130, in send
    self._sender.send(request.http_request, **request.context.options),
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\Python312\Lib\site-packages\azure\storage\blob\_shared\base_client.py", line 353, in send
    return self._transport.send(request, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\Python312\Lib\site-packages\azure\core\pipeline\transport\_requests_basic.py", line 409, in send
    raise error
azure.core.exceptions.ServiceRequestError: <urllib3.connection.HTTPSConnection object at 0x00000222F95FA240>: Failed to resolve 'odsblobcontainer.blob.odsblobcontainer.blob.core.windows.net' ([Errno 11001] getaddrinfo failed)

Can someone please help me?


Solution

  • Failed to resolve 'odsblobcontainer.blob.odsblobcontainer.blob.core.windows.net'

    The error above is a DNS resolution failure. It occurs because the connection_string format is incorrect: the EndpointSuffix contains the full blob host (and a path) instead of just the DNS suffix, so the SDK builds a malformed blob endpoint URL that repeats part of the hostname.

    The correct connection string format is: DefaultEndpointsProtocol=https;AccountName=<Your storage account name>;AccountKey=<Your storage account key>;EndpointSuffix=core.windows.net

    You can use the below code that will add the csv1 and csv2 record to excel sheet using Python SDK.

    Code:

    from azure.storage.blob import BlobServiceClient
    import pandas as pd
    from io import BytesIO
    from openpyxl import load_workbook
    from openpyxl.utils.dataframe import dataframe_to_rows
    from datetime import date


    today = date.today().strftime("%Y%m%d")
    # EndpointSuffix is the bare DNS suffix; anything else malforms the blob
    # endpoint URL and triggers the DNS resolution error from the question.
    connection_string = "DefaultEndpointsProtocol=https;AccountName=venkat326123;AccountKey=redacted;EndpointSuffix=core.windows.net"
    container_name = "<container name>"
    prefix = "prj1/"  # folder-style path belongs in the blob name, not the container
    excel_blob_name = prefix + "My Excel.xlsx"
    csv_blob_1_name = prefix + f"My_csv1_{today}.csv"
    csv_blob_2_name = prefix + f"My_csv2_{today}.csv"


    blob_service_client = BlobServiceClient.from_connection_string(connection_string)
    excel_blob_client = blob_service_client.get_blob_client(container=container_name, blob=excel_blob_name)


    print("Downloading Excel file...")
    excel_stream = BytesIO(excel_blob_client.download_blob().readall())
    workbook = load_workbook(excel_stream)

    # Read CSVs into DataFrames
    def read_csv_from_blob(blob_name):
        """Download a CSV blob and parse it into a pandas DataFrame."""
        blob = blob_service_client.get_blob_client(container=container_name, blob=blob_name)
        stream = BytesIO(blob.download_blob().readall())
        return pd.read_csv(stream)

    df_csv_1 = read_csv_from_blob(csv_blob_1_name)
    df_csv_2 = read_csv_from_blob(csv_blob_2_name)

    # Remove tab1 and tab5 if already present so reruns replace, not duplicate
    for sheet in ["tab1", "tab5"]:
        if sheet in workbook.sheetnames:
            workbook.remove(workbook[sheet])

    # Create worksheets for tab1 and tab5 (appended at the end for now)
    ws_tab1 = workbook.create_sheet("tab1")
    ws_tab5 = workbook.create_sheet("tab5")

    # Write data (header row first, then one row per record)
    for row in dataframe_to_rows(df_csv_1, index=False, header=True):
        ws_tab1.append(row)

    for row in dataframe_to_rows(df_csv_2, index=False, header=True):
        ws_tab5.append(row)

    # Reorder sheets: tab1 should be 1st, tab5 second-last
    remaining = [name for name in workbook.sheetnames if name not in ("tab1", "tab5")]
    if remaining:
        final_order = ["tab1"] + remaining[:-1] + ["tab5", remaining[-1]]
    else:
        # Degenerate case: no other sheets exist, so tab5 can only come last.
        final_order = ["tab1", "tab5"]
    # openpyxl has no public bulk-reorder API; _sheets is its private sheet list.
    workbook._sheets = [workbook[sheet] for sheet in final_order]

    # Save updated Excel to a fresh in-memory buffer and upload
    out_stream = BytesIO()
    workbook.save(out_stream)
    out_stream.seek(0)
    excel_blob_client.upload_blob(out_stream, overwrite=True)

    print("Excel updated with tab1 as first and tab5 as penultimate sheet.")
    print("Final sheet order:", workbook.sheetnames)
    

    Output:

    Connecting to Azure Blob Storage...
    Downloading Excel file...
     Excel updated with tab1 as first and tab5 as penultimate sheet.
    Final sheet order: ['tab1', 'tab5', 'Sheet1']
    

    [Screenshot: updated Excel workbook in the Azure Storage container]

    [Screenshot: workbook sheet order showing tab1 first and tab5 as the penultimate sheet]