Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
211 changes: 38 additions & 173 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,173 +1,38 @@
<div align=center>

## chardet: Go character encoding detector
[![Go Reference](https://pkg.go.dev/badge/github.com/wlynxg/chardet.svg)](https://pkg.go.dev/github.com/wlynxg/chardet)
[![License](https://img.shields.io/github/license/wlynxg/chardet.svg?style=flat)](https://github.com/wlynxg/chardet)
[![Go Report Card](https://goreportcard.com/badge/github.com/wlynxg/chardet)](https://goreportcard.com/report/github.com/wlynxg/chardet)

</div>

# Introduction

This is a Go port of Python's [chardet](https://github.com/chardet/chardet) library. Much respect and appreciation to the original authors for their excellent work.

chardet is a character encoding detector library written in Go. It helps you automatically detect the character encoding of text content.

# Installation

To install chardet, use `go get`:

```bash
go get github.com/wlynxg/chardet
```

## Supported Encodings & Languages

**Supported Encodings**:

<details>
<summary>Expand the list of supported encodings</summary>

- **Ascii**
- **UTF-8**
- **UTF-8-SIG**
- **UTF-16**
- **UTF-16LE**
- **UTF-16BE**
- **UTF-32**
- **UTF-32BE**
- **UTF-32LE**
- **GB2312**
- **HZ-GB-2312**
- **SHIFT_JIS**
- **Big5**
- **Johab**
- **KOI8-R**
- **TIS-620**
- **MacCyrillic**
- **MacRoman**
- **EUC-TW**
- **EUC-KR**
- **EUC-JP**
- **CP932**
- **CP949**
- **Windows-1250**
- **Windows-1251**
- **Windows-1252**
- **Windows-1253**
- **Windows-1254**
- **Windows-1255**
- **Windows-1256**
- **Windows-1257**
- **ISO-8859-1**
- **ISO-8859-2**
- **ISO-8859-5**
- **ISO-8859-6**
- **ISO-8859-7**
- **ISO-8859-8**
- **ISO-8859-9**
- **ISO-8859-13**
- **ISO-2022-CN**
- **ISO-2022-JP**
- **ISO-2022-KR**
- **X-ISO-10646-UCS-4-3412**
- **X-ISO-10646-UCS-4-2143**
- **IBM855**
- **IBM866**

</details>

**Supported Languages**:
<details>
<summary>Expand the list of supported languages</summary>

- Chinese
- Japanese
- Korean
- Hebrew
- Russian
- Greek
- Bulgarian
- Thai
- Turkish

</details>

# Usage

## Basic Usage

The simplest way to use chardet is with the `Detect` function:

```go
package main

import (
"fmt"
"github.com/wlynxg/chardet"
)

func main() {
data := []byte("Your text data here...")
result := chardet.Detect(data)
fmt.Printf("Detected result: %+v\n", result)
// Output: Detected result: {Encoding:Ascii Confidence:1 Language:}
}
```

## Advanced Usage

For handling large amounts of text, you can use the detector incrementally. This allows the detector to stop as soon as it reaches sufficient confidence in its result.
```go
package main

import (
"fmt"
"github.com/wlynxg/chardet"
)

func main() {
// Create a detector instance
detector := chardet.NewUniversalDetector(0)
// Process text in chunks
chunk1 := []byte("First chunk of text...")
chunk2 := []byte("Second chunk of text...")
detector.Feed(chunk1)
detector.Feed(chunk2)
// Get the result
result := detector.GetResult()
fmt.Printf("Detected result: %+v\n", result)
// Output: Detected result: {Encoding:Ascii Confidence:1 Language:}
}
```

## Processing Multiple Files

You can reuse the same detector instance for multiple files by using the `Reset()` method:
```go
package main

import (
"fmt"
"os"
"github.com/wlynxg/chardet"
)

func main() {
detector := chardet.NewUniversalDetector(0)
files := []string{"file1.txt", "file2.txt"}
for _, file := range files {
detector.Reset()
data, err := os.ReadFile(file)
if err != nil {
continue
}
detector.Feed(data)
result := detector.GetResult()
fmt.Printf("File %s encoding: %+v\n", file, result)
}
}
```

# License

`chardet` is licensed under the [MIT License](LICENSE), 100% free and open-source, forever.
+`Result.Encoding` continues to expose the legacy value (e.g. `Ascii`, `SHIFT_JIS`). For new applications, use `Result.Charset`, which follows IANA naming.
+
+## Decoding text
+
+Use the optional `github.com/wlynxg/chardet/lookup` helper to map `Result.Charset` to `golang.org/x/text/encoding`:
+
+```go
+package main
+
+import (
+ "fmt"
+
+ "github.com/wlynxg/chardet"
+ "github.com/wlynxg/chardet/lookup"
+)
+
+func main() {
+ data := []byte("Your text data here...")
+ result := chardet.Detect(data)
+
+ enc, err := lookup.LookupEncoding(result.Charset)
+ if err != nil {
+ panic(err)
+ }
+ if enc == nil {
+ fmt.Printf("no decoder for %s\n", result.Charset)
+ return
+ }
+
+ decoded, err := enc.NewDecoder().String(string(data))
+ if err != nil {
+ panic(err)
+ }
+
+ fmt.Println(decoded)
+}
+
*** End Patch
6 changes: 1 addition & 5 deletions chardet.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,7 @@ func DetectAll(buf []byte) []Result {
charsetName = n
}
}
results = append(results, Result{
Encoding: charsetName,
Confidence: setProbe.GetConfidence(),
Language: setProbe.Language(),
})
results = append(results, newResult(charsetName, setProbe.GetConfidence(), setProbe.Language()))
}
}

Expand Down
34 changes: 34 additions & 0 deletions consts/charset.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package consts

// legacyToCanonical maps the legacy encoding names historically reported by
// this library to their IANA-compliant charset names. Names not present here
// are already canonical and pass through both lookup functions unchanged.
var legacyToCanonical = map[string]string{
	Ascii:       "US-ASCII",
	ShiftJis:    "Shift_JIS",
	Johab:       "KS_C_5601-1987",
	MacRoman:    "macintosh",
	MacCyrillic: "x-mac-cyrillic",
}

// canonicalToLegacy is the inverse of legacyToCanonical. Building it directly
// in the var declaration (Go resolves package-level initialization order from
// the dependency) avoids an init function with hidden side effects.
var canonicalToLegacy = invert(legacyToCanonical)

// invert returns a new map with the keys and values of m swapped.
// If m contained duplicate values, one of the corresponding keys wins;
// legacyToCanonical has distinct values, so this does not arise here.
func invert(m map[string]string) map[string]string {
	out := make(map[string]string, len(m))
	for k, v := range m {
		out[v] = k
	}
	return out
}

// CanonicalCharset returns the IANA-compliant charset name for the provided
// legacy encoding name. Names without a legacy mapping are returned unchanged.
func CanonicalCharset(name string) string {
	if canonical, ok := legacyToCanonical[name]; ok {
		return canonical
	}
	return name
}

// LegacyCharset returns the legacy encoding name for the provided canonical
// charset. Names without a canonical mapping are returned unchanged.
func LegacyCharset(name string) string {
	if legacy, ok := canonicalToLegacy[name]; ok {
		return legacy
	}
	return name
}
38 changes: 8 additions & 30 deletions detector.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ import (
type Result struct {
// Encoding is the detected character encoding name
Encoding string `json:"encoding,omitempty"`
// Charset is the detected charset name using IANA-compliant naming
Charset string `json:"charset,omitempty"`
// Confidence indicates how confident the detector is about the result (0.0-1.0)
Confidence float64 `json:"confidence,omitempty"`
// Language represents the detected language (if applicable)
Expand Down Expand Up @@ -125,11 +127,7 @@ func (u *UniversalDetector) Feed(buf []byte) bool {
}
u.gotData = true
if encoding != "" {
u.result = Result{
Encoding: encoding,
Confidence: 1.0,
Language: "",
}
u.result = newResult(encoding, 1.0, "")
u.done = true
return false
}
Expand All @@ -155,11 +153,7 @@ func (u *UniversalDetector) Feed(buf []byte) bool {

if u.utf1632Probe.State() == consts.DetectingProbingState {
if u.utf1632Probe.Feed(buf) == consts.FoundItProbingState {
u.result = Result{
Encoding: u.utf1632Probe.CharSetName(),
Confidence: u.utf1632Probe.GetConfidence(),
Language: "",
}
u.result = newResult(u.utf1632Probe.CharSetName(), u.utf1632Probe.GetConfidence(), "")
u.done = true
return false
}
Expand All @@ -176,11 +170,7 @@ func (u *UniversalDetector) Feed(buf []byte) bool {
}

if u.escCharsetProbe.Feed(buf) == consts.FoundItProbingState {
u.result = Result{
Encoding: u.escCharsetProbe.CharSetName(),
Confidence: u.escCharsetProbe.GetConfidence(),
Language: u.escCharsetProbe.Language(),
}
u.result = newResult(u.escCharsetProbe.CharSetName(), u.escCharsetProbe.GetConfidence(), u.escCharsetProbe.Language())
u.done = true
}
case consts.HighByteInputState:
Expand All @@ -205,11 +195,7 @@ func (u *UniversalDetector) Feed(buf []byte) bool {
}

if charsetProbe.Feed(buf) == consts.FoundItProbingState {
u.result = Result{
Encoding: charsetProbe.CharSetName(),
Confidence: charsetProbe.GetConfidence(),
Language: charsetProbe.Language(),
}
u.result = newResult(charsetProbe.CharSetName(), charsetProbe.GetConfidence(), charsetProbe.Language())
u.done = true
break
}
Expand All @@ -234,11 +220,7 @@ func (u *UniversalDetector) GetResult() Result {
switch {
case !u.gotData:
case u.inputState == consts.PureAsciiInputState:
u.result = Result{
Encoding: consts.Ascii,
Confidence: 1.0,
Language: "",
}
u.result = newResult(consts.Ascii, 1.0, "")
case u.inputState == consts.HighByteInputState:
var (
confidence, maxProbeConfidence float64
Expand Down Expand Up @@ -268,11 +250,7 @@ func (u *UniversalDetector) GetResult() Result {
charsetName = n
}
}
u.result = Result{
Encoding: charsetName,
Confidence: confidence,
Language: maxConfidenceProbe.Language(),
}
u.result = newResult(charsetName, confidence, maxConfidenceProbe.Language())
}
}
return u.result
Expand Down
4 changes: 3 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
module github.com/wlynxg/chardet

go 1.23.0
go 1.21

require golang.org/x/text v0.22.0
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM=
golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY=
Loading