Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
211 changes: 38 additions & 173 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,173 +1,38 @@
<div align=center>

## chardet: Go character encoding detector
[![Go Reference](https://pkg.go.dev/badge/github.com/wlynxg/chardet.svg)](https://pkg.go.dev/github.com/wlynxg/chardet)
[![License](https://img.shields.io/github/license/wlynxg/chardet.svg?style=flat)](https://github.com/wlynxg/chardet)
[![Go Report Card](https://goreportcard.com/badge/github.com/wlynxg/chardet)](https://goreportcard.com/report/github.com/wlynxg/chardet)

</div>

# Introduction

This is a Go port of Python's [chardet](https://github.com/chardet/chardet) library. Much respect and appreciation to the original authors for their excellent work.

chardet is a character encoding detector library written in Go. It helps you automatically detect the character encoding of text content.

# Installation

To install chardet, use `go get`:

```bash
go get github.com/wlynxg/chardet
```

## Supported Encodings & Languages

**Supported Encodings**:

<details>
<summary>Expand the list of supported encodings</summary>

- **Ascii**
- **UTF-8**
- **UTF-8-SIG**
- **UTF-16**
- **UTF-16LE**
- **UTF-16BE**
- **UTF-32**
- **UTF-32BE**
- **UTF-32LE**
- **GB2312**
- **HZ-GB-2312**
- **SHIFT_JIS**
- **Big5**
- **Johab**
- **KOI8-R**
- **TIS-620**
- **MacCyrillic**
- **MacRoman**
- **EUC-TW**
- **EUC-KR**
- **EUC-JP**
- **CP932**
- **CP949**
- **Windows-1250**
- **Windows-1251**
- **Windows-1252**
- **Windows-1253**
- **Windows-1254**
- **Windows-1255**
- **Windows-1256**
- **Windows-1257**
- **ISO-8859-1**
- **ISO-8859-2**
- **ISO-8859-5**
- **ISO-8859-6**
- **ISO-8859-7**
- **ISO-8859-8**
- **ISO-8859-9**
- **ISO-8859-13**
- **ISO-2022-CN**
- **ISO-2022-JP**
- **ISO-2022-KR**
- **X-ISO-10646-UCS-4-3412**
- **X-ISO-10646-UCS-4-2143**
- **IBM855**
- **IBM866**

</details>

**Supported Languages**:
<details>
<summary>Expand the list of supported languages</summary>

- Chinese
- Japanese
- Korean
- Hebrew
- Russian
- Greek
- Bulgarian
- Thai
- Turkish

</details>

# Usage

## Basic Usage

The simplest way to use chardet is with the `Detect` function:

```go
package main

import (
"fmt"
"github.com/wlynxg/chardet"
)

func main() {
data := []byte("Your text data here...")
result := chardet.Detect(data)
fmt.Printf("Detected result: %+v\n", result)
// Output: Detected result: {Encoding:Ascii Confidence:1 Language:}
}
```

## Advanced Usage

For handling large amounts of text, you can use the detector incrementally. This allows the detector to stop as soon as it reaches sufficient confidence in its result.
```go
package main

import (
"fmt"
"github.com/wlynxg/chardet"
)

func main() {
// Create a detector instance
detector := chardet.NewUniversalDetector(0)
// Process text in chunks
chunk1 := []byte("First chunk of text...")
chunk2 := []byte("Second chunk of text...")
detector.Feed(chunk1)
detector.Feed(chunk2)
// Get the result
result := detector.GetResult()
fmt.Printf("Detected result: %+v\n", result)
// Output: Detected result: {Encoding:Ascii Confidence:1 Language:}
}
```

## Processing Multiple Files

You can reuse the same detector instance for multiple files by using the `Reset()` method:
```go
package main

import (
"fmt"
"os"
"github.com/wlynxg/chardet"
)

func main() {
detector := chardet.NewUniversalDetector(0)
files := []string{"file1.txt", "file2.txt"}
for _, file := range files {
detector.Reset()
data, err := os.ReadFile(file)
if err != nil {
continue
}
detector.Feed(data)
result := detector.GetResult()
fmt.Printf("File %s encoding: %+v\n", file, result)
}
}
```

# License

`chardet` is licensed under the [MIT License](LICENSE), 100% free and open-source, forever.
+`Result.Encoding` continues to expose the legacy value (e.g. `Ascii`, `SHIFT_JIS`). For new applications, use `Result.Charset`, which follows IANA naming.
+
+## Decoding text
+
+Use the optional `github.com/wlynxg/chardet/lookup` helper to map `Result.Charset` to `golang.org/x/text/encoding`:
+
+```go
+package main
+
+import (
+ "fmt"
+
+ "github.com/wlynxg/chardet"
+ "github.com/wlynxg/chardet/lookup"
+)
+
+func main() {
+ data := []byte("Your text data here...")
+ result := chardet.Detect(data)
+
+ enc, err := lookup.LookupEncoding(result.Charset)
+ if err != nil {
+ panic(err)
+ }
+ if enc == nil {
+ fmt.Printf("no decoder for %s\n", result.Charset)
+ return
+ }
+
+ decoded, err := enc.NewDecoder().String(string(data))
+ if err != nil {
+ panic(err)
+ }
+
+ fmt.Println(decoded)
+}
+
*** End Patch
6 changes: 1 addition & 5 deletions chardet.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,7 @@ func DetectAll(buf []byte) []Result {
charsetName = n
}
}
results = append(results, Result{
Encoding: charsetName,
Confidence: setProbe.GetConfidence(),
Language: setProbe.Language(),
})
results = append(results, newResult(charsetName, setProbe.GetConfidence(), setProbe.Language()))
}
}

Expand Down
34 changes: 34 additions & 0 deletions consts/charset.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package consts

// legacyToCanonical maps the legacy encoding names historically reported by
// this library to their IANA-compliant charset names. Names not present here
// are already canonical and pass through both lookup functions unchanged.
var legacyToCanonical = map[string]string{
	Ascii:       "US-ASCII",
	ShiftJis:    "Shift_JIS",
	Johab:       "KS_C_5601-1987",
	MacRoman:    "macintosh",
	MacCyrillic: "x-mac-cyrillic",
}

// canonicalToLegacy is the inverse of legacyToCanonical. Building it directly
// in the var declaration (Go resolves package-level initialization order from
// the dependency) avoids an init function with hidden side effects.
var canonicalToLegacy = invert(legacyToCanonical)

// invert returns a new map with the keys and values of m swapped.
// If m contained duplicate values, one of the corresponding keys wins;
// legacyToCanonical has distinct values, so this does not arise here.
func invert(m map[string]string) map[string]string {
	out := make(map[string]string, len(m))
	for k, v := range m {
		out[v] = k
	}
	return out
}

// CanonicalCharset returns the IANA-compliant charset name for the provided
// legacy encoding name. Names without a legacy mapping are returned unchanged.
func CanonicalCharset(name string) string {
	if canonical, ok := legacyToCanonical[name]; ok {
		return canonical
	}
	return name
}

// LegacyCharset returns the legacy encoding name for the provided canonical
// charset. Names without a canonical mapping are returned unchanged.
func LegacyCharset(name string) string {
	if legacy, ok := canonicalToLegacy[name]; ok {
		return legacy
	}
	return name
}
38 changes: 8 additions & 30 deletions detector.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ import (
type Result struct {
// Encoding is the detected character encoding name
Encoding string `json:"encoding,omitempty"`
// Charset is the detected charset name using IANA-compliant naming
Charset string `json:"charset,omitempty"`
// Confidence indicates how confident the detector is about the result (0.0-1.0)
Confidence float64 `json:"confidence,omitempty"`
// Language represents the detected language (if applicable)
Expand Down Expand Up @@ -125,11 +127,7 @@ func (u *UniversalDetector) Feed(buf []byte) bool {
}
u.gotData = true
if encoding != "" {
u.result = Result{
Encoding: encoding,
Confidence: 1.0,
Language: "",
}
u.result = newResult(encoding, 1.0, "")
u.done = true
return false
}
Expand All @@ -155,11 +153,7 @@ func (u *UniversalDetector) Feed(buf []byte) bool {

if u.utf1632Probe.State() == consts.DetectingProbingState {
if u.utf1632Probe.Feed(buf) == consts.FoundItProbingState {
u.result = Result{
Encoding: u.utf1632Probe.CharSetName(),
Confidence: u.utf1632Probe.GetConfidence(),
Language: "",
}
u.result = newResult(u.utf1632Probe.CharSetName(), u.utf1632Probe.GetConfidence(), "")
u.done = true
return false
}
Expand All @@ -176,11 +170,7 @@ func (u *UniversalDetector) Feed(buf []byte) bool {
}

if u.escCharsetProbe.Feed(buf) == consts.FoundItProbingState {
u.result = Result{
Encoding: u.escCharsetProbe.CharSetName(),
Confidence: u.escCharsetProbe.GetConfidence(),
Language: u.escCharsetProbe.Language(),
}
u.result = newResult(u.escCharsetProbe.CharSetName(), u.escCharsetProbe.GetConfidence(), u.escCharsetProbe.Language())
u.done = true
}
case consts.HighByteInputState:
Expand All @@ -205,11 +195,7 @@ func (u *UniversalDetector) Feed(buf []byte) bool {
}

if charsetProbe.Feed(buf) == consts.FoundItProbingState {
u.result = Result{
Encoding: charsetProbe.CharSetName(),
Confidence: charsetProbe.GetConfidence(),
Language: charsetProbe.Language(),
}
u.result = newResult(charsetProbe.CharSetName(), charsetProbe.GetConfidence(), charsetProbe.Language())
u.done = true
break
}
Expand All @@ -234,11 +220,7 @@ func (u *UniversalDetector) GetResult() Result {
switch {
case !u.gotData:
case u.inputState == consts.PureAsciiInputState:
u.result = Result{
Encoding: consts.Ascii,
Confidence: 1.0,
Language: "",
}
u.result = newResult(consts.Ascii, 1.0, "")
case u.inputState == consts.HighByteInputState:
var (
confidence, maxProbeConfidence float64
Expand Down Expand Up @@ -268,11 +250,7 @@ func (u *UniversalDetector) GetResult() Result {
charsetName = n
}
}
u.result = Result{
Encoding: charsetName,
Confidence: confidence,
Language: maxConfidenceProbe.Language(),
}
u.result = newResult(charsetName, confidence, maxConfidenceProbe.Language())
}
}
return u.result
Expand Down
4 changes: 3 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
module github.com/wlynxg/chardet

go 1.23.0
go 1.21

require golang.org/x/text v0.22.0
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM=
golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY=
Loading