I'm trying to write an md5 hash function in R
without calling any C-routines. While my code executes just fine, the output never matches tools::md5sum
(which does match examples provided in various online documents). I suspect a byte-order (or word-order) problem somewhere; as can be seen in the code provided below I have tried inserting some flipflops but without success.
I have checked for simple mismatches in the output, such as the correct bytes but in the wrong order, but no luck.
To run this function, you'll need the libraries Rmpfr
and bigBits
as well as the functions provided here (flip32
, flip16
, and bigRotate
).
# need these to run:
library(Rmpfr)
library(bigBits)
mymd5 <- function(msg){
if(!is(msg,'raw')) msg <- charToRaw(as.character(msg))
# a const to be used when truncating values > ffffffff
two32 <- as.bigz(2^(as.bigz(32)))
sidx <- c( 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22 , 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20 , 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23 , 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21 )
# K-values verified to be correct.
K <- as.bigz(2^32 * abs(sin(1:64)))
# IETF shows these values as-is, i.e. don't change Endian on these calculations
#ietf.org page says A is 01234567, ie lower bytes first.
adef <- '67452301'
bdef <- 'efcdab89'
cdef <- '98badcfe'
ddef <- '10325476'
# then optionally
adef <- flip32(adef)
bdef <- flip32(bdef)
cdef <- flip32(cdef)
ddef <- flip32(ddef)
a0 <- (base2base(adef,16,10))[[1]]
b0 <-(base2base(bdef,16,10))[[1]]
c0 <-(base2base(cdef,16,10))[[1]]
d0 <-(base2base(ddef,16,10))[[1]]
# append 0x80 and pad with 0x00 bytes so that the message length in bytes ≡ 56 (mod 64).
origlenbit <- length(msg) * 8
msg <- c(msg, as.raw('0x80') )
bytemod <- length(msg)%%64
raw00 <- as.raw('0x00')
# append '0x00' 56-bytemod times
if (bytemod < 56 && bytemod > 0 ) {
msg <- c(msg,rep(raw00,times=(56-bytemod)))
} else if (bytemod > 56) {
msg <- c(msg, rep(raw00, times = bytemod-56))
}
# https://datatracker.ietf.org/doc/html/rfc1321#section-3.4 says
# A 64-bit representation of b (the length of the message before the
# padding bits were added) is appended to the result of the previous
# step. In the unlikely event that b is greater than 2^64, then only
# the low-order 64 bits of b are used. (These bits are appended as two
# 32-bit words and appended low-order word first in accordance with the
# previous conventions.)
len16 <- base2base(origlenbit,10,16)[[1]]
lenbit2 <- base2base(origlenbit,10,2)[[1]]
if(nchar(lenbit2) < 64 ){
lenbit2 <- paste0(c(rep('0',times = 64 -nchar(lenbit2)) ,lenbit2), collapse='')
} else if (nchar(lenbit2) > 64) lenbit2 <- rev(rev(lenbit2)[1:64])
newbytes2 <- substring(lenbit2, seq(1,nchar(lenbit2)-7,by=8), last = seq(8, nchar(lenbit2),by = 8) )
newdec <- base2base(newbytes2,2,10)
newdbl <- rep(0,times= 8)
for (jd in 1: 8) newdbl[jd] <- as.numeric(newdec[[jd]])
newraw <- as.raw(newdbl)
# which order is correct?
# msg <- c(msg,newraw[c(7,8,5,6,3,4,1,2)])
msg <- c(msg, newraw[5:8], newraw[1:4])
# msg <- c(msg, newraw[1:4], newraw[5:8])
chunkit <- as.bigz(as.numeric(msg ) ) # all these will be <= 0xff
## turn groups of 512 BITS of msg into 16 words of 32 bits each.
for(jchunk in 1: (length(msg)/64)) {
thischunk <- matrix(chunkit[ ((jchunk-1)*64)+(1:64)],ncol=16)
# should I reverse the bytes of this chunk?
thisM <- as.bigz(rep(0,16) )
for(jm in 1:16) thisM[jm] <- sum(thischunk[,jm] * c(256^3,256^2,256,1) )
# for(jm in 1:16) thisM[jm] <- sum(thischunk[,jm] * c(1,256,256^2,256^3) )
# Initialize hash value for this chunk:
Ah<- a0
Bh<- b0
Ch<- c0
Dh<- d0
#now the chunk loop, which I make into 4 sep'te loops
for (jone in 1:16) {
Fh <- bigOr (bigAnd(Bh,Ch, inTwo=FALSE)[[1]] ,bigAnd(bigNot(Bh, inTwo=FALSE, outTwo=FALSE)[[1]],Dh, inTwo=FALSE)[[1]] )[[1]]
g <- jone
Fh<- (Fh + Ah + K[jone] + thisM[g]) %%two32 # truncate
Ah<- Dh
Dh<- Ch
Ch<- Bh
Bh<- (Bh + bigRotate(Fh, sidx[jone], inTwo=FALSE)[[1]] )%%two32
}
for (j2 in 17:32) {
Fh <- bigOr( bigAnd(Dh,Bh,inTwo=FALSE)[[1]] ,bigAnd(bigNot(Dh[[1]],inTwo=FALSE, outTwo=FALSE), Ch, inTwo=FALSE)[[1]] ,inTwo=FALSE)[[1]]
g <-(1 + 5*(j2-1))%%16 +1 # plus one due to indexing from 1
Fh<- (Fh + Ah + K[j2] + thisM[g])%%two32
Ah<- Dh
Dh<- Ch
Ch<- Bh
Bh<- (Bh + bigRotate(Fh, sidx[j2], inTwo=FALSE)[[1]])%%two32
}
for(j3 in 33:48){
Fh <- bigXor(Bh,bigXor(Ch,Dh, inTwo=FALSE)[[1]], inTwo=FALSE)[[1]]
g <-(5 + 3*(j3-1))%%16 +1
Fh<- (Fh + Ah + K[j3] + thisM[g])%%two32
Ah<- Dh
Dh<- Ch
Ch<- Bh
Bh<- (Bh + bigRotate(Fh, sidx[j3],inTwo=FALSE)[[1]])%%two32
}
for(j4 in 49:64){
Fh <- bigXor(Ch,bigOr(Bh,bigNot(Dh,inTwo=FALSE, outTwo=FALSE)[[1]],inTwo=FALSE)[[1]],inTwo=FALSE)[[1]]
g <- (7 * (j4 -1))%%16 +1
Fh<- (Fh + Ah + K[j4] + thisM[g])%%two32
Ah<- Dh
Dh<- Ch
Ch<- Bh
Bh<- (Bh + bigRotate(Fh, sidx[j4], inTwo=FALSE)[[1]])%%two32
}
# Add this chunk's hash to result so far:
a0<- (a0 + Ah)%%two32
b0<- (b0 + Bh)%%two32
c0<- (c0 + Ch)%%two32
d0<- (d0 + Dh)%%two32
} # end of jchunk
#finally, string the values together.
# watch for those leading zeros
a0x <-(base2base(a0,10,16))
a0x <- paste0(c(rep(0,times=max(8-(nchar(a0x)),0)), a0x), collapse='')
b0x <-(base2base(b0,10,16))
b0x <- paste0(c(rep(0,times=max(8-(nchar(b0x)),0)), b0x), collapse='')
c0x <-(base2base(c0,10,16))
c0x <- paste0(c(rep(0,times=max(8-(nchar(c0x)),0)), c0x), collapse='')
d0x <-(base2base(d0,10,16))
d0x <- paste0(c(rep(0,times=max(8-(nchar(d0x)),0)), d0x), collapse='')
thesum <- paste0(c(a0x,b0x,c0x,d0x) , sep='',collapse='')
return(thesum)
}
# helper func to test reordering bytes. e.g. 01234567 from 67452301
flip32 <- function(x){
xsep <- unlist(strsplit(x,'') )
xout <- paste0(c(xsep[c(7,8,5,6,3,4,1,2)]),collapse='')
return(xout)
}
flip16 <- function(x) {
xsep <- unlist(strsplit(x,''))
xout <- paste0(c(xsep[c(3,4,1,2)]),collapse='')
return(xout)
}
bigRotate <- function(x, shift, inBase = 10,binSize = 32, outBase = 10, inTwosComp = TRUE) {
#default binary size is 32 to match bitwShiftR
shift = floor(shift[1])
out <- rep('0',times = length(x)) # gmp::as.bigz(rep(0,times=length(x)))
for(jj in 1:length(x)) {
bintmp <- base2base(x[[jj]],inBase,2, binSize = binSize, inTwosComp = inTwosComp, outTwosComp = FALSE)[[1]]
#now all inputs will have a neg sign if negative.
isPos = TRUE
if(length(grep('-',bintmp)) ) {
isPos = FALSE
bintmp <- gsub('^-','',bintmp)
}
bintmp <- strsplit(bintmp,'')[[1]]
xlen = length(bintmp)
# now rotate the bits.
otmp <- unlist(paste0(c(bintmp[ ((1:xlen) + shift -1) %%(xlen) + 1]),collapse=''))
out[jj] <- base2base(otmp, 2, outBase, binSize=binSize, outTwosComp = FALSE, classOut = "character")[[1]]
if(!isPos) out[jj] <- paste0('-',out[jj], collapse='')
}
# provide output in source 'inBase'
if(is(x,'mpfr') || is(x,'mpfr1')) {
out <- Rmpfr::.bigz2mpfr(gmp::as.bigz(out))
} else if(is(x,'numeric')) {
out <- as.numeric(out)
} else if(is(x,'bigz')) out <- gmp::as.bigz(out)
return(out)
}
There are a few issues which are listed below.
flip32
It has been rewritten as
msg <- c(msg, as.raw('0x80') )
while (length(msg) %% 64 != 56) {
msg <- c(msg, as.raw(0))
}
thischunk
matrix needed to be reversed.thischunk <- thischunk[nrow(thischunk):1,]
bigRotate
This can be expressed as ((x<<amount) | (x>>(32-amount))) & 0xFFFFFFFF
so it has been rewritten as
bigRotate <- function(x, amount) {
two32 <- as.bigz(2^(as.bigz(32)))
x <- x %% two32
lshift <- bigShiftL(x, amount)
rshift <- bigShiftR(x, 32-amount)
# bigOr errors if either value is zero so handle those cases explicitly
if (lshift == 0) {
rshift
} else if (rshift == 0) {
lshift
} else {
bigOr(lshift, rshift) %% two32
}
}
a
, b
, c
, and d
was rewritten and the endianness was swapped.swap_endianness <- function(hex) {
chars <- strsplit(hex, "")[[1]]
pairs <- paste0(chars[c(TRUE, FALSE)], chars[c(FALSE, TRUE)])
paste0(rev(pairs), collapse = "")
}
thesum <- sum(
bigShiftL(a0, 32 * 0),
bigShiftL(b0, 32 * 1),
bigShiftL(c0, 32 * 2),
bigShiftL(d0, 32 * 3)
)
hex <- base2base(thesum, frombase = 10, tobase = 16)[[1]]
output <- swap_endianness(hex)
Implementing the above changes yields
library(bigBits)
library(gmp)
mymd5 <- function(msg){
if(!is(msg,'raw')) msg <- charToRaw(as.character(msg))
# a const to be used when truncating values > ffffffff
two32 <- as.bigz(2^(as.bigz(32)))
sidx <- c(7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22 , 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20 , 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23 , 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21)
K <- as.bigz(2^32 * abs(sin(1:64)))
adef <- '67452301'
bdef <- 'efcdab89'
cdef <- '98badcfe'
ddef <- '10325476'
a0 <- base2base(adef, 16, 10)[[1]]
b0 <- base2base(bdef, 16, 10)[[1]]
c0 <- base2base(cdef, 16, 10)[[1]]
d0 <- base2base(ddef, 16, 10)[[1]]
origlenbit <- length(msg) * 8
msg <- c(msg, as.raw('0x80') )
while (length(msg) %% 64 != 56) {
msg <- c(msg, as.raw(0))
}
length_bytes <- writeBin(as.integer(origlenbit), raw(), size = 8, endian = "little")
msg <- c(msg, length_bytes, rep(0, length.out = 8 - length(length_bytes)))
chunkit <- as.bigz(as.numeric(msg ) ) # all these will be <= 0xff
## turn groups of 512 BITS of msg into 16 words of 32 bits each.
for(jchunk in 1: (length(msg)/64)) {
thischunk <- matrix(chunkit[ ((jchunk-1)*64)+(1:64)],ncol=16)
thischunk <- thischunk[nrow(thischunk):1,]
thisM <- as.bigz(rep(0,16) )
for(jm in 1:16) thisM[jm] <- sum(thischunk[,jm] * c(256^3,256^2,256,1))
# Initialize hash value for this chunk:
Ah <- a0
Bh <- b0
Ch <- c0
Dh <- d0
#now the chunk loop, which I make into 4 sep'te loops
for (jone in 1:16) {
Fh <- bigOr (bigAnd(Bh,Ch, inTwo=FALSE)[[1]] ,bigAnd(bigNot(Bh, inTwo=FALSE, outTwo=FALSE)[[1]],Dh, inTwo=FALSE)[[1]] )[[1]]
g <- jone
Fh <- (Fh + Ah + K[jone] + thisM[g]) %% two32
Ah <- Dh
Dh <- Ch
Ch <- Bh
Bh <- (Bh + bigRotate(Fh, sidx[jone])) %% two32
}
for (j2 in 17:32) {
Fh <- bigOr( bigAnd(Dh,Bh,inTwo=FALSE)[[1]] ,bigAnd(bigNot(Dh[[1]],inTwo=FALSE, outTwo=FALSE), Ch, inTwo=FALSE)[[1]] ,inTwo=FALSE)[[1]]
g <-(1 + 5*(j2-1))%%16 +1 # plus one due to indexing from 1
Fh <- (Fh + Ah + K[j2] + thisM[g]) %% two32
Ah <- Dh
Dh <- Ch
Ch <- Bh
Bh <- (Bh + bigRotate(Fh, sidx[j2])) %% two32
}
for(j3 in 33:48){
Fh <- bigXor(Bh,bigXor(Ch,Dh, inTwo=FALSE)[[1]], inTwo=FALSE)[[1]]
g <-(5 + 3*(j3-1))%%16 +1
Fh<- (Fh + Ah + K[j3] + thisM[g]) %% two32
Ah<- Dh
Dh<- Ch
Ch<- Bh
Bh<- (Bh + bigRotate(Fh, sidx[j3]))%%two32
}
for(j4 in 49:64){
Fh <- bigXor(Ch,bigOr(Bh,bigNot(Dh,inTwo=FALSE, outTwo=FALSE)[[1]],inTwo=FALSE)[[1]],inTwo=FALSE)[[1]]
g <- (7 * (j4 -1))%%16 +1
Fh<- (Fh + Ah + K[j4] + thisM[g]) %% two32
Ah<- Dh
Dh<- Ch
Ch<- Bh
Bh<- (Bh + bigRotate(Fh, sidx[j4])) %% two32
}
# Add this chunk's hash to result so far:
a0 <- (a0 + Ah) %% two32
b0 <- (b0 + Bh) %% two32
c0 <- (c0 + Ch) %% two32
d0 <- (d0 + Dh) %% two32
} # end of jchunk
thesum <- sum(
bigShiftL(a0, 32 * 0),
bigShiftL(b0, 32 * 1),
bigShiftL(c0, 32 * 2),
bigShiftL(d0, 32 * 3)
)
hex <- base2base(thesum, frombase = 10, tobase = 16)[[1]]
swap_endianness(hex)
}
# ((x<<amount) | (x>>(32-amount))) & 0xFFFFFFFF
bigRotate <- function(x, amount) {
two32 <- as.bigz(2^(as.bigz(32)))
x <- x %% two32
lshift <- bigShiftL(x, amount)
rshift <- bigShiftR(x, 32-amount)
# bigOr errors if either value is zero so handle those cases explicitly
if (lshift == 0) {
rshift
} else if (rshift == 0) {
lshift
} else {
bigOr(lshift, rshift) %% two32
}
}
swap_endianness <- function(hex) {
chars <- strsplit(hex, "")[[1]]
pairs <- paste0(chars[c(TRUE, FALSE)], chars[c(FALSE, TRUE)])
paste0(rev(pairs), collapse = "")
}
Which passes a handful of unit tests
library(testthat)
test_that("md5 works", {
expect_equal(mymd5(""), "d41d8cd98f00b204e9800998ecf8427e")
expect_equal(mymd5("a"), "0cc175b9c0f1b6a831c399e269772661")
expect_equal(mymd5("abc"), "900150983cd24fb0d6963f7d28e17f72")
expect_equal(mymd5("message digest"), "f96b697d7cb7938d525a2f31aaf161d0")
expect_equal(mymd5("abcdefghijklmnopqrstuvwxyz"), "c3fcd3d76192e4007dfb496cca67e13b")
expect_equal(mymd5("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"), "d174ab98d277d9f5a5611c2c9f419d9f")
expect_equal(mymd5("12345678901234567890123456789012345678901234567890123456789012345678901234567890"), "57edf4a22be3c955ac49da2e2107b67a")
})
#> Test passed