// Package chardet ports character set detection from ICU.
package chardet
|
|
|
|
import (
|
|
"errors"
|
|
"sort"
|
|
)
|
|
|
|
// Result is one charset guess reported by a Detector.
type Result struct {
	// Charset is the IANA name of the detected charset.
	Charset string
	// Language is the IANA name of the detected language. Some charsets
	// leave it empty.
	Language string
	// Confidence rates the guess on a scale from 1 to 100; a larger value
	// means a more confident guess.
	Confidence int
}
|
|
|
|
// Detector implements charset detection.
|
|
type Detector struct {
|
|
recognizers []recognizer
|
|
stripTag bool
|
|
}
|
|
|
|
// List of charset recognizers
|
|
var recognizers = []recognizer{
|
|
newRecognizer_utf8(),
|
|
newRecognizer_utf16be(),
|
|
newRecognizer_utf16le(),
|
|
newRecognizer_utf32be(),
|
|
newRecognizer_utf32le(),
|
|
newRecognizer_8859_1_en(),
|
|
newRecognizer_8859_1_da(),
|
|
newRecognizer_8859_1_de(),
|
|
newRecognizer_8859_1_es(),
|
|
newRecognizer_8859_1_fr(),
|
|
newRecognizer_8859_1_it(),
|
|
newRecognizer_8859_1_nl(),
|
|
newRecognizer_8859_1_no(),
|
|
newRecognizer_8859_1_pt(),
|
|
newRecognizer_8859_1_sv(),
|
|
newRecognizer_8859_2_cs(),
|
|
newRecognizer_8859_2_hu(),
|
|
newRecognizer_8859_2_pl(),
|
|
newRecognizer_8859_2_ro(),
|
|
newRecognizer_8859_5_ru(),
|
|
newRecognizer_8859_6_ar(),
|
|
newRecognizer_8859_7_el(),
|
|
newRecognizer_8859_8_I_he(),
|
|
newRecognizer_8859_8_he(),
|
|
newRecognizer_windows_1251(),
|
|
newRecognizer_windows_1256(),
|
|
newRecognizer_KOI8_R(),
|
|
newRecognizer_8859_9_tr(),
|
|
|
|
newRecognizer_sjis(),
|
|
newRecognizer_gb_18030(),
|
|
newRecognizer_euc_jp(),
|
|
newRecognizer_euc_kr(),
|
|
newRecognizer_big5(),
|
|
|
|
newRecognizer_2022JP(),
|
|
newRecognizer_2022KR(),
|
|
newRecognizer_2022CN(),
|
|
|
|
newRecognizer_IBM424_he_rtl(),
|
|
newRecognizer_IBM424_he_ltr(),
|
|
newRecognizer_IBM420_ar_rtl(),
|
|
newRecognizer_IBM420_ar_ltr(),
|
|
}
|
|
|
|
// NewTextDetector creates a Detector for plain text.
|
|
func NewTextDetector() *Detector {
|
|
return &Detector{recognizers, false}
|
|
}
|
|
|
|
// NewHtmlDetector creates a Detector for Html.
|
|
func NewHtmlDetector() *Detector {
|
|
return &Detector{recognizers, true}
|
|
}
|
|
|
|
// NotDetectedError is the error DetectAll returns when no recognizer reports
// a positive Confidence for the input.
//
// The identifier and the message text are kept as they are for compatibility
// with existing callers.
var NotDetectedError = errors.New("Charset not detected.")
|
|
|
|
// DetectBest returns the Result with highest Confidence.
|
|
func (d *Detector) DetectBest(b []byte) (r *Result, err error) {
|
|
var all []Result
|
|
if all, err = d.DetectAll(b); err == nil {
|
|
r = &all[0]
|
|
}
|
|
return
|
|
}
|
|
|
|
// DetectAll returns all Results which have non-zero Confidence. The Results are sorted by Confidence in descending order.
|
|
func (d *Detector) DetectAll(b []byte) ([]Result, error) {
|
|
input := newRecognizerInput(b, d.stripTag)
|
|
outputChan := make(chan recognizerOutput)
|
|
for _, r := range d.recognizers {
|
|
go matchHelper(r, input, outputChan)
|
|
}
|
|
outputs := make([]recognizerOutput, 0, len(d.recognizers))
|
|
for i := 0; i < len(d.recognizers); i++ {
|
|
o := <-outputChan
|
|
if o.Confidence > 0 {
|
|
outputs = append(outputs, o)
|
|
}
|
|
}
|
|
if len(outputs) == 0 {
|
|
return nil, NotDetectedError
|
|
}
|
|
|
|
sort.Sort(recognizerOutputs(outputs))
|
|
dedupOutputs := make([]Result, 0, len(outputs))
|
|
foundCharsets := make(map[string]struct{}, len(outputs))
|
|
for _, o := range outputs {
|
|
if _, found := foundCharsets[o.Charset]; !found {
|
|
dedupOutputs = append(dedupOutputs, Result(o))
|
|
foundCharsets[o.Charset] = struct{}{}
|
|
}
|
|
}
|
|
if len(dedupOutputs) == 0 {
|
|
return nil, NotDetectedError
|
|
}
|
|
return dedupOutputs, nil
|
|
}
|
|
|
|
func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) {
|
|
outputChan <- r.Match(input)
|
|
}
|
|
|
|
type recognizerOutputs []recognizerOutput
|
|
|
|
func (r recognizerOutputs) Len() int { return len(r) }
|
|
func (r recognizerOutputs) Less(i, j int) bool { return r[i].Confidence > r[j].Confidence }
|
|
func (r recognizerOutputs) Swap(i, j int) { r[i], r[j] = r[j], r[i] }
|