Don't know if this is a duplicate, anyway my Google-fu is weak and Google almost never finds anything relevant if I type more than five words.
I am trying to parse the Master File Table located at "//?/X:/$MFT". I know that trying to open it directly fails with a permission-denied error, but I have found a way around that: opening "//?/X:" creates a handle that lets me read the boot sector, and from there I can locate and read the MFT.
I have already written the code, or at least the vast majority of it, I can already read all of the MFT into primary memory but at this stage the memory usage is high and the information is not well organized. But I have parsed all information I wanted with the help of this documentation.
You can see my code here.
As you can see from my code, I use a lot of offsets to slice the bytes into chunks and then call the corresponding function to decode each chunk in turn. Here is what I mean:
from typing import NamedTuple
class Record_Header_Flags(NamedTuple):
    """Decoded flag bits of an MFT file record header.

    Each field is True when the matching bit is set in the header's flags
    field. The fields are listed in the same order as the bit masks in
    HEADER_FLAGS.
    """

    In_Use: bool  # mask 1
    Directory: bool  # mask 2
    Extension: bool  # mask 4
    Special_Index: bool  # mask 8
class Record_Header(NamedTuple):
    """Fields decoded from an MFT file record header.

    The byte ranges noted on each field are the slices listed in
    FILE_RECORD_HEADER, in the same order as the fields appear here.
    """

    LogFile_Serial: int  # header bytes 8-16
    Written: int  # header bytes 16-18
    Hardlinks: int  # header bytes 18-20
    Flags: Record_Header_Flags  # header bytes 22-24, decoded bit by bit
    Record_Size: int  # header bytes 24-28
    Base_Record: int  # header bytes 32-38 (a 6-byte value)
    Base_Writes: int  # header bytes 38-40
    Record_ID: int  # header bytes 44-48
# Bit masks tested against the first byte of the header flags field, listed in
# the same order as the fields of Record_Header_Flags.
HEADER_FLAGS = (0x01, 0x02, 0x04, 0x08)
def parse_signed_little_endian(data: bytes) -> int:
    """Decode *data* as a little-endian two's-complement signed integer.

    Works for any width, including the 6-byte fields that ``struct`` has no
    format character for. The sign is taken from the top bit of the last
    (most significant) byte.

    Args:
        data: The raw bytes, least significant byte first.

    Returns:
        The signed integer value; ``0`` for empty input.
    """
    # int.from_bytes already implements two's complement, so there is no need
    # to invert and sum the bytes by hand.
    return int.from_bytes(data, "little", signed=True)
def parse_little_endian(data: bytes) -> int:
    """Decode *data* as an unsigned little-endian integer of any width.

    Args:
        data: The raw bytes, least significant byte first.

    Returns:
        The non-negative integer value; ``0`` for empty input.
    """
    return int.from_bytes(data, "little")
def parse_header_flags(data: bytes) -> Record_Header_Flags:
    """Decode the flags field of a file record header.

    Only the first byte of *data* is examined. Each mask in HEADER_FLAGS is
    tested against it, and the resulting booleans fill Record_Header_Flags
    in order.

    Args:
        data: The raw bytes of the flags field.

    Returns:
        A Record_Header_Flags with one boolean per mask.

    Raises:
        IndexError: If *data* is empty.
    """
    low_byte = data[0]
    return Record_Header_Flags(*(bool(low_byte & mask) for mask in HEADER_FLAGS))
# Layout table for the file record header: one (start, end, decoder) row per
# Record_Header field, in field order. Offsets are byte positions from the
# start of the record; each decoder receives the slice data[start:end].
FILE_RECORD_HEADER = (
    (0x08, 0x10, parse_little_endian),  # LogFile_Serial
    (0x10, 0x12, parse_little_endian),  # Written
    (0x12, 0x14, parse_little_endian),  # Hardlinks
    (0x16, 0x18, parse_header_flags),  # Flags
    (0x18, 0x1C, parse_little_endian),  # Record_Size
    (0x20, 0x26, parse_little_endian),  # Base_Record (6 bytes)
    (0x26, 0x28, parse_little_endian),  # Base_Writes
    (0x2C, 0x30, parse_little_endian),  # Record_ID
)
def parse_record_header(data: bytes) -> Record_Header:
    """Decode the header of a file record into a Record_Header.

    Every (start, end, decoder) row of FILE_RECORD_HEADER is applied to the
    slice ``data[start:end]``; the decoded values are passed positionally to
    Record_Header, so the table order must match the field order.

    Args:
        data: The raw record bytes, starting at the beginning of the record.

    Returns:
        The decoded header fields.

    Note:
        The length of *data* is not validated here. Slices that run past the
        end of *data* are simply short or empty when handed to the decoders.
    """
    decoded = [decode(data[start:end]) for start, end, decode in FILE_RECORD_HEADER]
    return Record_Header(*decoded)
# Sample 48-byte file record header, written as 8-byte groups for readability.
data = (
    b"FILE0\x00\x03\x00"
    b"\x9dt \x13\x0c\x00\x00\x00"
    b"\x08\x00\x02\x008\x00\x01\x00"
    b"\xd8\x01\x00\x00\x00\x04\x00\x00"
    b"\x00\x00\x00\x00\x00\x00\x00\x00"
    b"\x05\x00\x00\x00\xff\xff\x00\x00"
)
print(parse_record_header(data))
Record_Header(LogFile_Serial=51860501661, Written=8, Hardlinks=2, Flags=Record_Header_Flags(In_Use=True, Directory=False, Extension=False, Special_Index=False), Record_Size=472, Base_Record=0, Base_Writes=0, Record_ID=65535)
Someone told me this is inefficient and unPythonic, and that the proper way to do this is to use a combination of struct and ctypes.
I know I can parse a 4-byte signed little-endian value using struct.unpack("<i", data)[0], a 4-byte unsigned one with the format string "<I", an 8-byte signed one with "<q", and an 8-byte unsigned one with "<Q". But some values are sequences of 6 bytes.
And I don't know how to use ctypes structures.
Furthermore, the MFT uses non-standard formats such as Windows File Time:
from datetime import datetime, timedelta
from typing import NamedTuple
# Starting instant that decoded timestamps are measured from. It is a naive
# datetime: no timezone information is attached.
EPOCH = datetime(1601, 1, 1, 0, 0, 0)


def parse_NTFS_timestamp(data: bytes) -> datetime:
    """Convert a little-endian tick count into a datetime.

    The bytes are read as an unsigned little-endian count of 100-nanosecond
    ticks (1e-7 seconds each) elapsed since EPOCH.

    Args:
        data: The raw timestamp bytes, least significant byte first.

    Returns:
        EPOCH advanced by the tick count, as a naive datetime. datetime only
        resolves microseconds, so the final sub-microsecond tick digit is
        truncated.
    """
    ticks = int.from_bytes(data, "little")
    # Stay in integer arithmetic: present-day tick counts exceed 2**53, so
    # multiplying by the float 1e-7 would round away low-order digits and
    # could shift the result by a microsecond.
    return EPOCH + timedelta(microseconds=ticks // 10)
How would one use ctypes and struct to parse the example I have given, and to parse byte sequences containing non-standard encodings, for example the timestamp fields from 0x10 $STANDARD_INFORMATION?
Here is an example of parsing the data with both struct and ctypes. Refer to the struct "Format Characters" and the ctypes "Structures and unions" documentation:
import ctypes as ct
import struct
from collections import namedtuple
class RecordHeader(ct.LittleEndianStructure):
    """ctypes view of the first 48 bytes of a file record.

    The record stores its integers little-endian, so the structure is
    declared as LittleEndianStructure. A plain Structure would use the host's
    native byte order and decode wrongly on a big-endian machine.

    Fields named ``pad*`` cover bytes that are skipped and are left out of
    the repr. Every field sits at an offset that is a multiple of its size,
    so no packing directive is needed.
    """

    _fields_ = (
        ('pad1', 8 * ct.c_uint8),
        ('LogFile_Serial', ct.c_uint64),
        ('Written', ct.c_uint16),
        ('HardLinks', ct.c_uint16),
        ('pad2', 2 * ct.c_uint8),
        ('Flags', ct.c_uint16),
        ('Record_Size', ct.c_uint32),
        ('pad3', 4 * ct.c_uint8),
        ('_Base_Record_Base_Writes', ct.c_uint64),
        ('pad4', 4 * ct.c_uint8),
        ('Record_ID', ct.c_uint32),
    )

    # ctypes can't handle a 6-byte field. Read as 8-byte field and parse with
    # properties using bit shifting and masking.

    @property
    def Base_Record(self):
        """Low 48 bits of the combined 8-byte field."""
        return self._Base_Record_Base_Writes & 0xFFFFFFFFFFFF

    @property
    def Base_Writes(self):
        """High 16 bits of the combined 8-byte field."""
        return self._Base_Record_Base_Writes >> 48

    def __repr__(self):
        """Display representation of the structure."""
        # Gather the non-padding fields and their current values.
        shown = [
            (name, getattr(self, name))
            for name, *_ in self._fields_
            if not name.startswith('pad')
        ]
        params = ', '.join(f'{name}={value}' for name, value in shown)
        # Append the two derived properties after the raw fields.
        return f'RecordHeader({params}, Base_Record={self.Base_Record}, Base_Writes={self.Base_Writes})'
# OP data, but modified to test the Base_Record/Base_Writes fields with
# something non-zero. Written as 8-byte groups for readability.
data = (
    b'FILE0\x00\x03\x00'
    b'\x9dt \x13\x0c\x00\x00\x00'
    b'\x08\x00\x02\x008\x00\x01\x00'
    b'\xd8\x01\x00\x00\x00\x04\x00\x00'
    b'\x88\x77\x66\x55\x44\x33\x22\x11'
    b'\x05\x00\x00\x00\xff\xff\x00\x00'
)

# struct can't handle a 6-byte field either. Read as an 8-byte field. Process later as needed.
result = struct.unpack('<8xQHH2xHL4xQ4xL', data)
RHeader = namedtuple(
    'RHeader',
    [
        'LogFile_Serial',
        'Written',
        'HardLinks',
        'Flags',
        'Record_Size',
        'Base_Record_Base_Writes',
        'Record_ID',
    ],
)
print(RHeader._make(result))

# ctypes Structure example. Processes the 6-byte/2-byte field.
rh = RecordHeader.from_buffer_copy(data)
print(rh)
print(f'{rh._Base_Record_Base_Writes=:#x} {rh.Base_Record=:#x} {rh.Base_Writes=:#x}')
Output:
RHeader(LogFile_Serial=51860501661, Written=8, HardLinks=2, Flags=1, Record_Size=472, Base_Record_Base_Writes=1234605616436508552, Record_ID=65535)
RecordHeader(LogFile_Serial=51860501661, Written=8, HardLinks=2, Flags=1, Record_Size=472, _Base_Record_Base_Writes=1234605616436508552, Record_ID=65535, Base_Record=56368583571336, Base_Writes=4386)
rh._Base_Record_Base_Writes=0x1122334455667788 rh.Base_Record=0x334455667788 rh.Base_Writes=0x1122
Note that parsing timestamps is another question. Stick to one at a time on SO. Ask another question with a specific example such as the raw data and the expected result. Add your coding attempt.