I have data nested in folders and subfolders within Azure Data Lake Storage (ADLS). Each file has a name, and ADLS exposes its last-modified time. I want to calculate the total storage size of the data modified within a particular time frame. How can I find the files in that time range and sum their sizes?
The code below should work once you provide the start and end timestamps for the range whose size you want to measure:
from datetime import datetime

# This is intended to run in a Databricks notebook, where `dbutils` and a
# Spark session are already available; no extra Spark setup is needed,
# since only dbutils.fs operations are used below.
# ADLS Gen2 storage account details
account_name = "<account-name>"
container_name = "<container_name>"
relative_path = "ADL_STG_NEW/attrep_change*"  # note: whether dbutils.fs.ls expands the trailing * glob depends on the runtime
# Define the start and end timestamps
start_timestamp = datetime.strptime("2023-11-16 00:00:00", "%Y-%m-%d %H:%M:%S")
end_timestamp = datetime.strptime("2023-11-17 00:00:00", "%Y-%m-%d %H:%M:%S")
# Convert the timestamps to epoch milliseconds, since dbutils.fs.ls
# reports FileInfo.modificationTime in milliseconds
start_timestamp_ms = int(start_timestamp.timestamp() * 1000)
end_timestamp_ms = int(end_timestamp.timestamp() * 1000)
# ADLS Gen2 path
adls_base_path = f"abfss://{container_name}@{account_name}.dfs.core.windows.net/{relative_path}"
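# With hypothetical values account_name = "mylake" and container_name = "raw",
# adls_base_path resolves to:
#   abfss://raw@mylake.dfs.core.windows.net/ADL_STG_NEW/attrep_change*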
# Yield the immediate subdirectories of ls_path
def get_dir_content(ls_path):
    for dir_path in dbutils.fs.ls(ls_path):
        if dir_path.isDir() and ls_path != dir_path.path:
            yield dir_path.path
# Walk the first-level subdirectories and report, per directory, the size
# (in MB) and count of files modified inside the window, while also
# accumulating a grand total across all directories
total_file_count = 0
total_size_mb = 0.0

for adls_path in get_dir_content(adls_base_path):
    file_list = dbutils.fs.ls(adls_path)
    # Keep only files whose modification time falls inside the window
    filtered_files = [
        (file.name, file.modificationTime, file.size)
        for file in file_list
        if start_timestamp_ms <= file.modificationTime <= end_timestamp_ms
    ]
    file_count = len(filtered_files)
    dir_size_mb = sum(file_info[2] for file_info in filtered_files) / (1024 * 1024)
    total_file_count += file_count
    total_size_mb += dir_size_mb
    print(f"{adls_path},{dir_size_mb},{file_count}")

print(f"Number of files modified between {start_timestamp} and {end_timestamp}: {total_file_count}")
print(f"Total size of files: {total_size_mb} MegaBytes")