I am a bit stuck on a simple problem, but I seem to have lost some grey matter and keep looping (which is exactly the problem here). I am trying to upload files from a local directory to Azure Storage and then append each file name to a checkpoint file in the cloud, so that later I can check whether I have already processed a given file.
Here is my code:
import os
from azure.identity import DefaultAzureCredential
from azure.storage.filedatalake import DataLakeServiceClient, DataLakeFileClient

# Iterate over files in the local directory
for filename in os.listdir(local_path):
    if os.path.isfile(os.path.join(local_path, filename)):
        # Get a reference to the file client
        file_client = directory_client.get_file_client(filename)
        # Upload the file to Azure Data Lake Storage
        with open(os.path.join(local_path, filename), "rb") as local_file:
            file_client.upload_data(local_file, overwrite=True)
        if not checkpoint_directory_client.create_file(file=checkpoint_file_name):
            checkpoint_directory_client.create_file(file=checkpoint_file_name)
        checkpoint_file_client = checkpoint_directory_client.get_file_client(checkpoint_file_name)
*<<<I think I need something here but I tried everything and I cannot seem to do it>>>>*
I tried with DataLakeFileClient as well, but to no avail. I can either get one name uploaded to the file (the last one, which is indicative of its position inside the loop) or an empty file. I just need a little push in the right direction!
I am trying something like this:
checkpoint_directory_client.create_file(file=checkpoint_file_name)
checkpoint_file_client = checkpoint_directory_client.get_file_client(checkpoint_file_name)
data = filename
checkpoint_file_client.append_data(data,offset=0,length=len(data))
checkpoint_file_client.flush_data(len(data))
Thanks a bunch
Append Filename to File in Azure Storage
You can use the code below to append the filename to the checkpoint file after creating it, using the append_data method of the DataLakeFileClient object.
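One thing worth knowing: append_data only stages bytes at the given offset; nothing becomes visible until flush_data commits the file up to its new total length. A minimal sketch of the pattern, assuming checkpoint_file_client points at a freshly created (empty) checkpoint file:

data = b"first-file.csv\n"  # example payload; any bytes work
checkpoint_file_client.append_data(data, offset=0, length=len(data))
checkpoint_file_client.flush_data(len(data))  # commit: file length is now len(data)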
Here is the full code:
import os
from azure.identity import DefaultAzureCredential
from azure.storage.filedatalake import DataLakeServiceClient

try:
    account_url = "https://xxxx.dfs.core.windows.net/"
    default_credential = DefaultAzureCredential()
    service_client = DataLakeServiceClient(account_url, credential=default_credential)

    filesystem_name = "test"
    directory_name = "sample"
    checkpoint_directory_name = "checkpoint"
    checkpoint_file_name = "checkpointfile.txt"

    local_path = r"C:\Users\xxxx\Documents\venkat1"
    if not os.path.exists(local_path):
        os.mkdir(local_path)

    directory_client = service_client.get_file_system_client(filesystem_name).get_directory_client(directory_name)
    checkpoint_directory_client = service_client.get_directory_client(file_system=filesystem_name, directory=f"{directory_name}/{checkpoint_directory_name}")

    # Iterate over files in the local directory
    for filename in os.listdir(local_path):
        if os.path.isfile(os.path.join(local_path, filename)):
            file_client = directory_client.get_file_client(filename)
            # Upload the file to Azure Data Lake Storage
            with open(os.path.join(local_path, filename), "rb") as local_file:
                file_client.upload_data(local_file, overwrite=True)
            # Note: create_file truncates the checkpoint file on every iteration,
            # and offset=0 always writes at the start, so only the last filename
            # survives -- see the Update below for the fix.
            checkpoint_directory_client.create_file(file=checkpoint_file_name)
            checkpoint_file_client = checkpoint_directory_client.get_file_client(checkpoint_file_name)
            data = filename.encode('utf-8')
            checkpoint_file_client.append_data(data, offset=0, length=len(data))
            checkpoint_file_client.flush_data(len(data))

    print("All files uploaded successfully.")
except Exception as ex:
    print('Exception:')
    print(ex)
The above code uploads files from a local directory to Azure Data Lake Storage and creates a checkpoint file to keep track of the uploaded files. For each file, it appends the filename to the checkpoint file using the append_data method of the DataLakeFileClient object.
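To verify what actually landed in the checkpoint file, you can download it and print its contents. A minimal sketch, assuming the checkpoint_file_client from the code above:

# Download the committed contents of the checkpoint file and print them.
downloaded = checkpoint_file_client.download_file().readall()
print(downloaded.decode('utf-8'))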
Output:
All files uploaded successfully.
Portal: (screenshot of the uploaded files and checkpoint file in the Azure portal)
Reference:
Use Python to manage data in Azure Data Lake Storage Gen2 - Azure Storage | Microsoft Learn
Update:
To write all of the filenames into the checkpoint file, you can modify the code as below:
Code:
# Create the checkpoint file once, before the loop, so the appends accumulate.
checkpoint_directory_client.create_file(file=checkpoint_file_name)
checkpoint_file_client_new = checkpoint_directory_client.get_file_client(checkpoint_file_name)

for filename in os.listdir(local_path):
    if os.path.isfile(os.path.join(local_path, filename)):
        file_client = directory_client.get_file_client(filename)
        with open(os.path.join(local_path, filename), "rb") as local_file:
            file_client.upload_data(local_file, overwrite=True)
        # Append the filename (one per line) at the current end of the
        # checkpoint file, then flush to the new total length.
        data = f"{filename}\n".encode('utf-8')
        offset = checkpoint_file_client_new.get_file_properties().size
        checkpoint_file_client_new.append_data(data, offset=offset, length=len(data))
        checkpoint_file_client_new.flush_data(offset + len(data))
Output: (the checkpoint file now lists every uploaded filename, one per line)
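With one filename per line, a later run can read the checkpoint file back and skip anything already recorded. A minimal sketch of that check, assuming the same checkpoint_file_client_new and local_path as above:

# Build a set of already-processed filenames from the checkpoint file.
downloaded = checkpoint_file_client_new.download_file().readall()
processed = set(downloaded.decode('utf-8').splitlines())

for filename in os.listdir(local_path):
    if filename in processed:
        continue  # already uploaded on a previous run
    # ...upload the file and append its name to the checkpoint, as above...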