rmd5endiannessmd5sum

md5 hash calculation function failing


I'm trying to write an md5 hash function in R without calling any C-routines. While my code executes just fine, the output never matches tools::md5sum (which does match examples provided in various online documents). I suspect a byte-order (or word-order) problem somewhere; as can be seen in the code provided below I have tried inserting some flipflops but without success. I have checked for simple mismatches in the output, such as the correct bytes but in the wrong order, but no luck.

To run this function, you'll need the libraries Rmpfr and bigBits as well as the functions provided here (flip32, flip16, and bigRotate ).

# need these to run:
library(Rmpfr)
library(bigBits)

mymd5 <- function(msg){

  if(!is(msg,'raw')) msg <- charToRaw(as.character(msg))
# a const to be used when truncating values > ffffffff
two32 <- as.bigz(2^(as.bigz(32))) 

sidx <- c( 7, 12, 17, 22,  7, 12, 17, 22,  7, 12, 17, 22,  7, 12, 17, 22 , 5,  9, 14, 20,  5,  9, 14, 20,  5,  9, 14, 20,  5,  9, 14, 20 , 4, 11, 16, 23,  4, 11, 16, 23,  4, 11, 16, 23,  4, 11, 16, 23 , 6, 10, 15, 21,  6, 10, 15, 21,  6, 10, 15, 21,  6, 10, 15, 21 )

#  K-values verified to be correct.
 K <- as.bigz(2^32 * abs(sin(1:64))) 

# IETF  shows these values as-is, i.e. don't change Endian on these calculations
#ietf.org page says A is 01234567, ie lower bytes first.  

adef <- '67452301'
bdef <- 'efcdab89'
cdef <- '98badcfe'
ddef <- '10325476'

# then optionally
adef <- flip32(adef)
bdef <- flip32(bdef)
cdef <- flip32(cdef)
ddef <- flip32(ddef)

a0 <- (base2base(adef,16,10))[[1]]
b0 <-(base2base(bdef,16,10))[[1]]
c0 <-(base2base(cdef,16,10))[[1]]
d0 <-(base2base(ddef,16,10))[[1]]

#  append 0x80  and pad with 0x00 bytes so that the message length in bytes ≡ 56 (mod 64).
  origlenbit <- length(msg) * 8
  msg <- c(msg, as.raw('0x80') )
  bytemod <- length(msg)%%64
  raw00 <- as.raw('0x00')
#  append  '0x00' 56-bytemod times
    if (bytemod <  56 && bytemod > 0 ) {
        msg <- c(msg,rep(raw00,times=(56-bytemod)))
    } else if (bytemod > 56) {
        msg <- c(msg, rep(raw00, times = bytemod-56))
    }
    
 # https://datatracker.ietf.org/doc/html/rfc1321#section-3.4 says
 #  A 64-bit representation of b (the length of the message before the
   # padding bits were added) is appended to the result of the previous
   # step. In the unlikely event that b is greater than 2^64, then only
   # the low-order 64 bits of b are used. (These bits are appended as two
   # 32-bit words and appended low-order word first in accordance with the
   # previous conventions.)
    len16 <-    base2base(origlenbit,10,16)[[1]]
    lenbit2 <- base2base(origlenbit,10,2)[[1]]
    if(nchar(lenbit2) < 64 ){
        lenbit2 <- paste0(c(rep('0',times = 64 -nchar(lenbit2)) ,lenbit2), collapse='')
    } else if (nchar(lenbit2) > 64) lenbit2 <- rev(rev(lenbit2)[1:64])
    newbytes2 <- substring(lenbit2, seq(1,nchar(lenbit2)-7,by=8), last = seq(8, nchar(lenbit2),by = 8) )    
    newdec <- base2base(newbytes2,2,10)
    newdbl <- rep(0,times= 8)
    for (jd in 1: 8) newdbl[jd] <- as.numeric(newdec[[jd]])
    newraw <- as.raw(newdbl)
#  which order is correct? 
#   msg <- c(msg,newraw[c(7,8,5,6,3,4,1,2)])
    msg <- c(msg, newraw[5:8], newraw[1:4])
#   msg <- c(msg, newraw[1:4], newraw[5:8])
    chunkit <- as.bigz(as.numeric(msg ) ) # all these will be <= 0xff
##  turn groups of 512 BITS of msg into 16 words of 32 bits each.
    for(jchunk in 1: (length(msg)/64)) {  
        thischunk <- matrix(chunkit[ ((jchunk-1)*64)+(1:64)],ncol=16)
#  should I reverse the bytes of this chunk? 
        thisM <- as.bigz(rep(0,16) )
        for(jm in 1:16) thisM[jm] <- sum(thischunk[,jm] * c(256^3,256^2,256,1) )
#       for(jm in 1:16) thisM[jm] <- sum(thischunk[,jm] * c(1,256,256^2,256^3) )
      # Initialize hash value for this chunk:
         Ah<- a0
         Bh<- b0
         Ch<- c0
         Dh<- d0
#now the chunk loop, which I make into 4 sep'te loops 
        for (jone in 1:16) {
            Fh <- bigOr (bigAnd(Bh,Ch, inTwo=FALSE)[[1]] ,bigAnd(bigNot(Bh, inTwo=FALSE, outTwo=FALSE)[[1]],Dh, inTwo=FALSE)[[1]] )[[1]]
            g <- jone
            Fh<- (Fh + Ah + K[jone] + thisM[g]) %%two32  # truncate
            Ah<- Dh
            Dh<- Ch
            Ch<- Bh
            Bh<- (Bh + bigRotate(Fh, sidx[jone], inTwo=FALSE)[[1]] )%%two32   
        }

        for (j2 in 17:32) {
            Fh <- bigOr( bigAnd(Dh,Bh,inTwo=FALSE)[[1]] ,bigAnd(bigNot(Dh[[1]],inTwo=FALSE, outTwo=FALSE), Ch, inTwo=FALSE)[[1]] ,inTwo=FALSE)[[1]] 
            g <-(1 + 5*(j2-1))%%16 +1 # plus one due to indexing from 1
            Fh<- (Fh + Ah + K[j2] + thisM[g])%%two32  
            Ah<- Dh
            Dh<- Ch
            Ch<- Bh
            Bh<- (Bh + bigRotate(Fh, sidx[j2], inTwo=FALSE)[[1]])%%two32   
        }
        for(j3 in 33:48){
            Fh <- bigXor(Bh,bigXor(Ch,Dh, inTwo=FALSE)[[1]], inTwo=FALSE)[[1]]
            g <-(5 + 3*(j3-1))%%16 +1
            Fh<- (Fh + Ah + K[j3] + thisM[g])%%two32  
            Ah<- Dh
            Dh<- Ch
            Ch<- Bh
            Bh<- (Bh + bigRotate(Fh, sidx[j3],inTwo=FALSE)[[1]])%%two32 
        }
        for(j4 in 49:64){
            Fh <- bigXor(Ch,bigOr(Bh,bigNot(Dh,inTwo=FALSE, outTwo=FALSE)[[1]],inTwo=FALSE)[[1]],inTwo=FALSE)[[1]]
            g <- (7 * (j4 -1))%%16 +1
            Fh<- (Fh + Ah + K[j4] + thisM[g])%%two32  
            Ah<- Dh
            Dh<- Ch
            Ch<- Bh
            Bh<- (Bh + bigRotate(Fh, sidx[j4], inTwo=FALSE)[[1]])%%two32 
        }
           # Add this chunk's hash to result so far:
        a0<- (a0 + Ah)%%two32
        b0<- (b0 + Bh)%%two32
        c0<- (c0 + Ch)%%two32
        d0<- (d0 + Dh)%%two32
    }   # end of jchunk

#finally, string the values together.
# watch for those leading zeros
a0x <-(base2base(a0,10,16))
a0x <- paste0(c(rep(0,times=max(8-(nchar(a0x)),0)), a0x), collapse='')
b0x <-(base2base(b0,10,16))
b0x <- paste0(c(rep(0,times=max(8-(nchar(b0x)),0)), b0x), collapse='')
c0x <-(base2base(c0,10,16))
c0x <- paste0(c(rep(0,times=max(8-(nchar(c0x)),0)), c0x), collapse='')
d0x <-(base2base(d0,10,16))
d0x <- paste0(c(rep(0,times=max(8-(nchar(d0x)),0)), d0x), collapse='')


thesum <- paste0(c(a0x,b0x,c0x,d0x) , sep='',collapse='')
return(thesum)
} 

# helper func to test reordering bytes. e.g.  01234567 from 67452301

flip32 <- function(x){
    xsep <- unlist(strsplit(x,'') )
    xout <- paste0(c(xsep[c(7,8,5,6,3,4,1,2)]),collapse='')
    return(xout)
}
flip16 <- function(x) {
        xsep <- unlist(strsplit(x,''))
    xout <- paste0(c(xsep[c(3,4,1,2)]),collapse='')
    return(xout)
}


bigRotate <- function(x, shift,  inBase = 10,binSize = 32, outBase = 10, inTwosComp = TRUE) {
#default binary size is 32 to match bitwShiftR  
 shift = floor(shift[1])

 out <- rep('0',times = length(x))  # gmp::as.bigz(rep(0,times=length(x)))
    for(jj in 1:length(x)) {
        bintmp <- base2base(x[[jj]],inBase,2, binSize = binSize, inTwosComp = inTwosComp, outTwosComp = FALSE)[[1]]
#now all inputs will have a neg sign if negative.
        isPos = TRUE
        if(length(grep('-',bintmp)) ) {
            isPos = FALSE
            bintmp <- gsub('^-','',bintmp)
        }
        bintmp <- strsplit(bintmp,'')[[1]]
        xlen = length(bintmp)
    # now rotate the bits.  
        otmp  <- unlist(paste0(c(bintmp[ ((1:xlen) + shift -1) %%(xlen) + 1]),collapse=''))
        out[jj] <- base2base(otmp, 2, outBase, binSize=binSize, outTwosComp = FALSE, classOut = "character")[[1]]
        if(!isPos) out[jj] <- paste0('-',out[jj], collapse='')
    }
    # provide output in source 'inBase'
    if(is(x,'mpfr') || is(x,'mpfr1')) {
        out <- Rmpfr::.bigz2mpfr(gmp::as.bigz(out))
    } else if(is(x,'numeric')) {
        out <- as.numeric(out)
    } else if(is(x,'bigz')) out <- gmp::as.bigz(out)
    
 return(out) 
}

Solution

  • There are a few issues which are listed below.

    1. Removed flip32

    2. Issue with appending the padding bits

    It has been rewritten as

    msg <- c(msg, as.raw('0x80') )
    while (length(msg) %% 64 != 56) {
      msg <- c(msg, as.raw(0))
    }
    

    3. The rows of the thischunk matrix needed to be reversed.

    thischunk <- thischunk[nrow(thischunk):1,]
    

    4. Issue with bigRotate

    This can be expressed as ((x<<amount) | (x>>(32-amount))) & 0xFFFFFFFF so it has been rewritten as

    bigRotate <- function(x, amount) {
      two32 <- as.bigz(2^(as.bigz(32)))
      x <- x %% two32
      lshift <- bigShiftL(x, amount)
      rshift <- bigShiftR(x, 32-amount)
      
      # bigOr errors if either value is zero so handle those cases explicitly
      if (lshift == 0) {
        rshift
      } else if (rshift == 0) {
        lshift
      } else {
        bigOr(lshift, rshift) %% two32
      }
    }
    

    5. Combining a, b, c, and d was rewritten and the endianness was swapped.

    swap_endianness <- function(hex) {
      chars <- strsplit(hex, "")[[1]]
      pairs <- paste0(chars[c(TRUE, FALSE)], chars[c(FALSE, TRUE)])
      paste0(rev(pairs), collapse = "")
    }
    
    thesum <- sum(
        bigShiftL(a0, 32 * 0),
        bigShiftL(b0, 32 * 1),
        bigShiftL(c0, 32 * 2),
        bigShiftL(d0, 32 * 3)
    )
    hex <- base2base(thesum, frombase = 10, tobase = 16)[[1]]
    output <- swap_endianness(hex)
    

    Corrected code

    Implementing the above changes yields

    library(bigBits)
    library(gmp)
    
    
    mymd5 <- function(msg){
      if(!is(msg,'raw')) msg <- charToRaw(as.character(msg))
      
      # a const to be used when truncating values > ffffffff
      two32 <- as.bigz(2^(as.bigz(32)))
      sidx <- c(7, 12, 17, 22,  7, 12, 17, 22,  7, 12, 17, 22,  7, 12, 17, 22 , 5,  9, 14, 20,  5,  9, 14, 20,  5,  9, 14, 20,  5,  9, 14, 20 , 4, 11, 16, 23,  4, 11, 16, 23, 4, 11, 16, 23,  4, 11, 16, 23 , 6, 10, 15, 21,  6, 10, 15, 21,  6, 10, 15, 21,  6, 10, 15, 21)
      K <- as.bigz(2^32 * abs(sin(1:64))) 
      
      adef <- '67452301'
      bdef <- 'efcdab89'
      cdef <- '98badcfe'
      ddef <- '10325476'
      
      a0 <- base2base(adef, 16, 10)[[1]]
      b0 <- base2base(bdef, 16, 10)[[1]]
      c0 <- base2base(cdef, 16, 10)[[1]]
      d0 <- base2base(ddef, 16, 10)[[1]]
    
      origlenbit <- length(msg) * 8
      
      msg <- c(msg, as.raw('0x80') )
      while (length(msg) %% 64 != 56) {
        msg <- c(msg, as.raw(0))
      }
    
      length_bytes <- writeBin(as.integer(origlenbit), raw(), size = 8, endian = "little")
      msg <- c(msg, length_bytes, rep(0, length.out = 8 - length(length_bytes)))
      
      chunkit <- as.bigz(as.numeric(msg ) ) # all these will be <= 0xff
      
      ##  turn groups of 512 BITS of msg into 16 words of 32 bits each.
      for(jchunk in 1: (length(msg)/64)) {  
        thischunk <- matrix(chunkit[ ((jchunk-1)*64)+(1:64)],ncol=16)
        thischunk <- thischunk[nrow(thischunk):1,]
        thisM <- as.bigz(rep(0,16) )
        for(jm in 1:16) thisM[jm] <- sum(thischunk[,jm] * c(256^3,256^2,256,1))
    
        # Initialize hash value for this chunk:
        Ah <- a0
        Bh <- b0
        Ch <- c0
        Dh <- d0
        
        #now the chunk loop, which I make into 4 sep'te loops 
        for (jone in 1:16) {
          Fh <- bigOr (bigAnd(Bh,Ch, inTwo=FALSE)[[1]] ,bigAnd(bigNot(Bh, inTwo=FALSE, outTwo=FALSE)[[1]],Dh, inTwo=FALSE)[[1]] )[[1]]
          g <- jone
          Fh <- (Fh + Ah + K[jone] + thisM[g]) %% two32
          Ah <- Dh
          Dh <- Ch
          Ch <- Bh
          Bh <- (Bh + bigRotate(Fh, sidx[jone])) %% two32   
        }
        
        for (j2 in 17:32) {
          Fh <- bigOr( bigAnd(Dh,Bh,inTwo=FALSE)[[1]] ,bigAnd(bigNot(Dh[[1]],inTwo=FALSE, outTwo=FALSE), Ch, inTwo=FALSE)[[1]] ,inTwo=FALSE)[[1]] 
          g <-(1 + 5*(j2-1))%%16 +1 # plus one due to indexing from 1
          Fh <- (Fh + Ah + K[j2] + thisM[g]) %% two32  
          Ah <- Dh
          Dh <- Ch
          Ch <- Bh
          Bh <- (Bh + bigRotate(Fh, sidx[j2])) %% two32
        }
        for(j3 in 33:48){
          Fh <- bigXor(Bh,bigXor(Ch,Dh, inTwo=FALSE)[[1]], inTwo=FALSE)[[1]]
          g <-(5 + 3*(j3-1))%%16 +1
          Fh<- (Fh + Ah + K[j3] + thisM[g]) %% two32  
          Ah<- Dh
          Dh<- Ch
          Ch<- Bh
          Bh<- (Bh + bigRotate(Fh, sidx[j3]))%%two32 
        }
        for(j4 in 49:64){
          Fh <- bigXor(Ch,bigOr(Bh,bigNot(Dh,inTwo=FALSE, outTwo=FALSE)[[1]],inTwo=FALSE)[[1]],inTwo=FALSE)[[1]]
          g <- (7 * (j4 -1))%%16 +1
          Fh<- (Fh + Ah + K[j4] + thisM[g]) %% two32  
          Ah<- Dh
          Dh<- Ch
          Ch<- Bh
          Bh<- (Bh + bigRotate(Fh, sidx[j4])) %% two32 
        }
        
        # Add this chunk's hash to result so far:
        a0 <- (a0 + Ah) %% two32
        b0 <- (b0 + Bh) %% two32
        c0 <- (c0 + Ch) %% two32
        d0 <- (d0 + Dh) %% two32
      }   # end of jchunk
    
      thesum <- sum(
        bigShiftL(a0, 32 * 0),
        bigShiftL(b0, 32 * 1),
        bigShiftL(c0, 32 * 2),
        bigShiftL(d0, 32 * 3)
      )
      
      hex <- base2base(thesum, frombase = 10, tobase = 16)[[1]]
      swap_endianness(hex)
    }
    
    
    # ((x<<amount) | (x>>(32-amount))) & 0xFFFFFFFF
    bigRotate <- function(x, amount) {
      two32 <- as.bigz(2^(as.bigz(32)))
      x <- x %% two32
      lshift <- bigShiftL(x, amount)
      rshift <- bigShiftR(x, 32-amount)
      
      # bigOr errors if either value is zero so handle those cases explicitly
      if (lshift == 0) {
        rshift
      } else if (rshift == 0) {
        lshift
      } else {
        bigOr(lshift, rshift) %% two32
      }
    }
    
    
    swap_endianness <- function(hex) {
      chars <- strsplit(hex, "")[[1]]
      pairs <- paste0(chars[c(TRUE, FALSE)], chars[c(FALSE, TRUE)])
      paste0(rev(pairs), collapse = "")
    }
    

    Which passes a handful of unit tests

    library(testthat)
    
    test_that("md5 works", {
      expect_equal(mymd5(""), "d41d8cd98f00b204e9800998ecf8427e")
      expect_equal(mymd5("a"), "0cc175b9c0f1b6a831c399e269772661")
      expect_equal(mymd5("abc"), "900150983cd24fb0d6963f7d28e17f72")
      expect_equal(mymd5("message digest"), "f96b697d7cb7938d525a2f31aaf161d0")
      expect_equal(mymd5("abcdefghijklmnopqrstuvwxyz"), "c3fcd3d76192e4007dfb496cca67e13b")
      expect_equal(mymd5("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"), "d174ab98d277d9f5a5611c2c9f419d9f")
      expect_equal(mymd5("12345678901234567890123456789012345678901234567890123456789012345678901234567890"), "57edf4a22be3c955ac49da2e2107b67a")
    })
    #> Test passed