go character-encoding file-conversion iana

Converting Windows text files to utf-8 via IANA codes


I'd like to use chardet and golang.org/x/text to convert non-utf-8 files to utf-8. However, all the code examples that I've found so far require the user to hard-code the desired conversion direction. For example:

package main

import (
  "fmt"
  "io/ioutil"
  "os"
  "golang.org/x/text/encoding/charmap"
)

// main round-trips a sample string through Windows-1252: it encodes the text,
// writes the encoded bytes to example.txt, then reads the file back through a
// Windows-1252 decoder and prints the decoded text.
func main() {

  // Write the string
  // encoded to Windows-1252
  encoder := charmap.Windows1252.NewEncoder()
  s, e := encoder.String("This is sample text with runes Š")
  if e != nil {
    panic(e)
  }
  // Check the write result: if it fails silently, the read below would
  // either fail with a misleading error or decode stale file contents.
  e = ioutil.WriteFile("example.txt", []byte(s), os.ModePerm)
  if e != nil {
    panic(e)
  }

  // Decode to UTF-8
  f, e := os.Open("example.txt")
  if e != nil {
    panic(e)
  }
  defer f.Close()
  decoder := charmap.Windows1252.NewDecoder()
  // Wrap the file so bytes are decoded as they are read.
  reader := decoder.Reader(f)
  b, err := ioutil.ReadAll(reader)
  if err != nil {
    panic(err)
  }
  fmt.Println(string(b))
}

How do I need to change the following line:

decoder := charmap.Windows1252.NewDecoder()

to accept an IANA code to select the required decoder? (chardet returns lower-case code page names, e.g. windows-1250, windows-1252 etc.)


Solution

  • The simplest solution is a factory function that returns the appropriate decoder/encoder for a given IANA code. The only work you need to do is to map the IANA codes to the corresponding charmaps. The full list of available charmaps is in the documentation of the golang.org/x/text/encoding/charmap package.

    // codeToCharmap maps IANA code page names, spelled in lower case, to the
    // matching charmap. It is initialized at its declaration rather than in
    // init() so that it is already populated when other package-level
    // initializers look names up in it.
    var codeToCharmap = map[string]*charmap.Charmap{
        "windows-1250": charmap.Windows1250,
        "windows-1252": charmap.Windows1252,
        // ...
    }
    
    // getDecoder looks code up in codeToCharmap and returns a new decoder for
    // the matching charmap. It returns nil when code has no entry, so callers
    // must check the result before using it.
    func getDecoder(code string) *encoding.Decoder {
        selected, found := codeToCharmap[code]
        if !found {
            return nil
        }

        return selected.NewDecoder()
    }
    
    // getEncoder looks code up in codeToCharmap and returns a new encoder for
    // the matching charmap. It returns nil when code has no entry, so callers
    // must check the result before using it.
    func getEncoder(code string) *encoding.Encoder {
        selected, found := codeToCharmap[code]
        if !found {
            return nil
        }

        return selected.NewEncoder()
    }