I have this code to convert a binary 2D array to a byte array:
import numpy as np

def flatten_and_pad_to_multiple_of_8(binary_matrix):
    # Step 1: Calculate the size of the original flattened array
    rows, cols = binary_matrix.shape
    current_length = rows * cols
    # Step 2: Calculate the required length that is a multiple of 8
    padded_length = ((current_length + 7) // 8) * 8
    # Step 3: Initialize flat_bits with the required padded length
    flat_bits = np.zeros(padded_length, dtype=np.uint8)
    # Step 4: Fill flat_bits with values from the binary matrix
    idx = 0
    for i in range(rows):
        for j in range(cols):
            flat_bits[idx] = binary_matrix[i, j]
            idx += 1
    return flat_bits
def matrix_to_ascii(matrix):
    flat_bits = flatten_and_pad_to_multiple_of_8(matrix)
    # Convert the flattened bits into bytes
    ascii_string = ""
    for i in range(0, len(flat_bits), 8):
        byte = 0
        for j in range(8):
            byte = (byte << 1) | flat_bits[i + j]
        ascii_char = chr(byte)
        ascii_string += ascii_char
    return ascii_string
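(For the 5×5 example below, current_length is 25, so padded_length is ((25 + 7) // 8) * 8 = 32: the string will be four characters, with seven zero padding bits at the end.)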
If

matrix = np.array([[0, 1, 1, 1, 1],
                   [1, 0, 1, 1, 1],
                   [1, 1, 0, 1, 1],
                   [1, 1, 1, 0, 1],
                   [1, 1, 1, 1, 0]], dtype=np.uint8)

then matrix_to_ascii(matrix) is '}÷ß\x00'. That is a str, though, so I then have to call matrix_to_ascii(matrix).encode(). My problem is in converting it back to a matrix.
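As a sanity check on those characters (not part of my original code): the first eight flattened bits are 0, 1, 1, 1, 1 (row 0) followed by 1, 0, 1 (the start of row 1), i.e. 0b01111101 = 125 = '}', and the other bytes fall out the same way:

assert int("01111101", 2) == 125 and chr(125) == "}"
# The full packed string is 0x7D, 0xF7, 0xDF, 0x00:
assert matrix_to_ascii(matrix) == "}\xf7\xdf\x00"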
To save space, I first convert the string to a byte array. Here is the broken code to convert it back to a matrix:
def ascii_to_matrix(byte_array, original_shape):
    """
    byte_array must be a bytestring (encode the string before passing it in).
    """
    # Initialize the binary matrix with the original shape
    rows, cols = original_shape
    binary_matrix = np.zeros((rows, cols), dtype=np.uint8)
    # Fill the binary matrix with bits from the byte array
    bit_idx = 0
    for byte in byte_array:
        for j in range(8):
            if bit_idx < rows * cols:
                binary_matrix[bit_idx // cols, bit_idx % cols] = (byte >> (7 - j)) & 1
                bit_idx += 1
            else:
                break
    return binary_matrix
Unfortunately, it gives the wrong output:
ascii_to_matrix(matrix_to_ascii(matrix).encode(), (5, 5))
array([[0, 1, 1, 1, 1],
       [1, 0, 1, 1, 1],
       [0, 0, 0, 0, 1],
       [1, 1, 0, 1, 1],
       [0, 1, 1, 1, 1]], dtype=uint8)
What am I doing wrong?
(I am not using any fancier numpy functions as I will want to speed this all up with numba. In particular, I can't use packbits or tobytes as they are not supported by numba. I also can't use bytes or bytearray.)
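(For reference, under those constraints the packing I can write is limited to plain loops like this sketch — matrix_to_bytes is just an illustrative name — which returns the packed bytes as a uint8 array instead of a str:)

def matrix_to_bytes(binary_matrix):
    # Same bit-packing as matrix_to_ascii, but the result is a uint8
    # array of raw byte values, so no chr()/encode() round trip is needed.
    flat_bits = flatten_and_pad_to_multiple_of_8(binary_matrix)
    n_bytes = len(flat_bits) // 8
    packed = np.zeros(n_bytes, dtype=np.uint8)
    for i in range(n_bytes):
        byte = 0
        for j in range(8):
            byte = (byte << 1) | int(flat_bits[i * 8 + j])
        packed[i] = byte
    return packed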
Your ascii_to_matrix seems to work fine if you encode your string to a 1-byte-per-character encoding; .encode() defaults to UTF-8, and some of the characters in your matrix's "text" representation will end up being two bytes in UTF-8.
print(x := matrix_to_ascii(matrix))
print(f"{x.encode()=!r}")
print(f"{x.encode('iso-8859-1')=!r}")
print(z := ascii_to_matrix(x.encode("iso-8859-1"), matrix.shape))
assert np.array_equal(matrix, z)
prints out
}÷ß
x.encode()=b'}\xc3\xb7\xc3\x9f\x00'
x.encode('iso-8859-1')=b'}\xf7\xdf\x00'
[[0 1 1 1 1]
[1 0 1 1 1]
[1 1 0 1 1]
[1 1 1 0 1]
[1 1 1 1 0]]
– note how the default encoding is not the same as the single-byte encoding.
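This works because ISO-8859-1 (Latin-1) maps code points 0–255 one-to-one onto byte values 0–255, so every character your chr(byte) produces survives the encode round trip as a single byte; a quick self-check:

# Each code point 0..255 encodes to the single byte of the same value.
assert all(chr(b).encode("iso-8859-1")[0] == b for b in range(256))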