Skip to content

Commit 15bb131

Browse files
ajnavarro authored and dennwc committed
Refactor Oniguruma integration
Instead of using a command to change the imports before the build, use a build tag to generate the correct binary. This allows applications to compile enry with Oniguruma with less trouble. Signed-off-by: Antonio Jesus Navarro Perez <[email protected]>
1 parent 8da8516 commit 15bb131

File tree

8 files changed

+71
-52
lines changed

8 files changed

+71
-52
lines changed

.travis.yml

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,7 @@ install:
2424
- mkdir -p $GOPATH/src/gopkg.in/src-d
2525
- ln -s $PWD $GOPATH/src/gopkg.in/src-d/enry.v1
2626
- cd $GOPATH/src/gopkg.in/src-d/enry.v1
27-
- if [ "$ONIGURUMA" == "1" ]; then make oniguruma; fi
28-
- go get -v -t ./...
29-
27+
- if [ "$ONIGURUMA" == "1" ]; then tags="$tags oniguruma"; fi; go get -v -t --tags "$tags" ./...
3028
script:
3129
- make test-coverage
3230

@@ -100,7 +98,7 @@ jobs:
10098
- sudo apt-get update
10199
- sudo apt-get install -y --no-install-recommends clang g++ gcc gcc-multilib libc6-dev libc6-dev-i386 mingw-w64 patch xz-utils
102100
- cd ${HOME}
103-
- curl -sSL ${OSXCROSS_URL} | tar -C ${HOME} -xzf -
101+
- curl -sSL ${OSXCROSS_URL} | tar -C ${HOME} -xzf -
104102
- cd $GOPATH/src/gopkg.in/src-d/enry.v1
105103

106104
script:

Makefile

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -38,12 +38,6 @@ DARWIN_SHARED_LIB=$(DARWIN_DIR)/libenry.dylib
3838
HEADER_FILE=libenry.h
3939
NATIVE_LIB=./shared/enry.go
4040

41-
# source files to be patched for using "rubex" instead of "regexp"
42-
RUBEX_PATCHED := internal/code-generator/generator/heuristics.go internal/tokenizer/tokenize.go common.go
43-
RUBEX_ORIG := $(RUBEX_PATCHED:=.orig)
44-
45-
.PHONY: revert-oniguruma
46-
4741
$(LINGUIST_PATH):
4842
git clone https://github.com/github/linguist.git $@
4943

@@ -69,15 +63,6 @@ benchmarks-slow: $(LINGUST_PATH)
6963
mkdir -p benchmarks/output && go test -run=NONE -bench=. -slow -benchtime=100ms -timeout=100h >benchmarks/output/enry_samples.bench && \
7064
benchmarks/linguist-samples.rb 5 >benchmarks/output/linguist_samples.bench
7165

72-
$(RUBEX_ORIG): %.orig : %
73-
sed -i.orig -e 's/"regexp"/regexp "github.com\/moovweb\/rubex"/g' $<
74-
@touch $@
75-
76-
oniguruma: $(RUBEX_ORIG)
77-
78-
revert-oniguruma:
79-
@for file in $(RUBEX_PATCHED); do if [ -e "$$file.orig" ]; then mv "$$file.orig" "$$file" && echo mv "$$file.orig" "$$file"; fi; done
80-
8166
build-cli:
8267
go build -o enry -ldflags "$(LOCAL_LDFLAGS)" cmd/enry/main.go
8368

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,10 @@ On Ubuntu, it is
3737
sudo apt install libonig-dev
3838
```
3939

40-
To build enry with Oniguruma regexps, patch the imports with
40+
To build enry with Oniguruma regexps, use the `oniguruma` build tag
4141

4242
```
43-
make oniguruma
43+
go get -v -t --tags oniguruma ./...
4444
```
4545

4646
and then rebuild the project.
@@ -162,7 +162,7 @@ We update enry when changes are done in linguist's master branch on the followin
162162
* [vendor.yml](https://github.com/github/linguist/blob/master/lib/linguist/vendor.yml)
163163
* [documentation.yml](https://github.com/github/linguist/blob/master/lib/linguist/documentation.yml)
164164

165-
Currently we don't have any procedure established to automatically detect changes in the linguist project and regenerate the code.
165+
Currently we don't have any procedure established to automatically detect changes in the linguist project and regenerate the code.
166166
So we update the generated code as needed, without any specific criteria.
167167

168168
If you want to update *enry* because of changes in linguist, you can run the *go
@@ -217,7 +217,7 @@ If you want to reproduce the same benchmarks you can run:
217217

218218
benchmarks/run.sh
219219

220-
from the root's project directory and it'll run benchmarks for enry and linguist, parse the output, create csv files and create a histogram (you must have installed [gnuplot](http://gnuplot.info) in your system to get the histogram).
220+
from the root's project directory and it'll run benchmarks for enry and linguist, parse the output, create csv files and create a histogram (you must have installed [gnuplot](http://gnuplot.info) in your system to get the histogram).
221221

222222
This can take some time, so to run local benchmarks for a quick check you can either:
223223

common.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@ import (
44
"bufio"
55
"bytes"
66
"path/filepath"
7-
"regexp"
87
"strings"
98

109
"gopkg.in/src-d/enry.v1/data"
10+
"gopkg.in/src-d/enry.v1/regex"
1111
)
1212

1313
// OtherLanguage is used as a zero value when a function can not return a specific language.
@@ -197,10 +197,10 @@ func footScope(content []byte, scope int) (index int) {
197197
}
198198

199199
var (
200-
reEmacsModeline = regexp.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`)
201-
reEmacsLang = regexp.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`)
202-
reVimModeline = regexp.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`)
203-
reVimLang = regexp.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`)
200+
reEmacsModeline = regex.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`)
201+
reEmacsLang = regex.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`)
202+
reVimModeline = regex.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`)
203+
reVimLang = regex.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`)
204204
)
205205

206206
// GetLanguagesByEmacsModeline returns a slice of possible languages for the given content.
@@ -283,8 +283,8 @@ func GetLanguagesByShebang(_ string, content []byte, _ []string) (languages []st
283283
}
284284

285285
var (
286-
shebangExecHack = regexp.MustCompile(`exec (\w+).+\$0.+\$@`)
287-
pythonVersion = regexp.MustCompile(`python\d\.\d+`)
286+
shebangExecHack = regex.MustCompile(`exec (\w+).+\$0.+\$@`)
287+
pythonVersion = regex.MustCompile(`python\d\.\d+`)
288288
)
289289

290290
func getInterpreter(data []byte) (interpreter string) {

internal/code-generator/generator/heuristics.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,11 @@ import (
66
"fmt"
77
"io"
88
"io/ioutil"
9-
"regexp"
109
"strconv"
1110
"strings"
1211
"text/template"
12+
13+
"gopkg.in/src-d/enry.v1/regex"
1314
)
1415

1516
// Heuristics reads from fileToParse and builds source file from tmplPath. It complies with type File signature.
@@ -38,7 +39,7 @@ const (
3839
)
3940

4041
var (
41-
disambLine = regexp.MustCompile(`^(\s*)disambiguate`)
42+
disambLine = regex.MustCompile(`^(\s*)disambiguate`)
4243
definedRegs = make(map[string]string)
4344
illegalCharacter = map[string]string{
4445
"#": "Sharp",
@@ -378,7 +379,7 @@ func convertToValidRegexp(reg string) string {
378379
func includeToRegExp(include string) string {
379380
content := include[strings.Index(include, `(`)+1 : strings.Index(include, `)`)]
380381
content = strings.Trim(content, `"'`)
381-
return regexp.QuoteMeta(content)
382+
return regex.QuoteMeta(content)
382383
}
383384

384385
func getLanguages(line string) []string {

internal/tokenizer/tokenize.go

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@ package tokenizer
22

33
import (
44
"bytes"
5-
"regexp"
5+
6+
"gopkg.in/src-d/enry.v1/regex"
67
)
78

89
const byteLimit = 100000
@@ -72,20 +73,20 @@ var (
7273
//
7374
// These regexps were converted to work in the same way for both engines:
7475
//
75-
reLiteralStringQuotes = regexp.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
76-
reSingleLineComment = regexp.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`)
77-
reMultilineComment = regexp.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`)
78-
reLiteralNumber = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
79-
reShebang = regexp.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`)
80-
rePunctuation = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
81-
reSGML = regexp.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`)
82-
reSGMLComment = regexp.MustCompile(`(<!--(.|\n)*?-->)`)
83-
reSGMLAttributes = regexp.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`)
84-
reSGMLLoneAttribute = regexp.MustCompile(`([0-9A-Za-z_]+)`)
85-
reRegularToken = regexp.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
86-
reOperators = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
87-
88-
regexToSkip = []*regexp.Regexp{
76+
reLiteralStringQuotes = regex.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
77+
reSingleLineComment = regex.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`)
78+
reMultilineComment = regex.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`)
79+
reLiteralNumber = regex.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
80+
reShebang = regex.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`)
81+
rePunctuation = regex.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
82+
reSGML = regex.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`)
83+
reSGMLComment = regex.MustCompile(`(<!--(.|\n)*?-->)`)
84+
reSGMLAttributes = regex.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`)
85+
reSGMLLoneAttribute = regex.MustCompile(`([0-9A-Za-z_]+)`)
86+
reRegularToken = regex.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
87+
reOperators = regex.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
88+
89+
regexToSkip = []regex.EnryRegexp{
8990
// The order must be this
9091
reLiteralStringQuotes,
9192
reMultilineComment,
@@ -124,22 +125,22 @@ func getShebangToken(matchedShebang [][]byte) []byte {
124125
return tokenShebang
125126
}
126127

127-
func commonExtracAndReplace(content []byte, re *regexp.Regexp) ([]byte, [][]byte) {
128+
func commonExtractAndReplace(content []byte, re regex.EnryRegexp) ([]byte, [][]byte) {
128129
tokens := re.FindAll(content, -1)
129130
content = re.ReplaceAll(content, []byte(` `))
130131
return content, tokens
131132
}
132133

133134
func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) {
134-
return commonExtracAndReplace(content, rePunctuation)
135+
return commonExtractAndReplace(content, rePunctuation)
135136
}
136137

137138
func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) {
138-
return commonExtracAndReplace(content, reRegularToken)
139+
return commonExtractAndReplace(content, reRegularToken)
139140
}
140141

141142
func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) {
142-
return commonExtracAndReplace(content, reOperators)
143+
return commonExtractAndReplace(content, reOperators)
143144
}
144145

145146
func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) {

regex/oniguruma.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
// +build oniguruma
2+
3+
package regex
4+
5+
import (
6+
"github.com/moovweb/rubex"
7+
)
8+
9+
type EnryRegexp = *rubex.Regexp
10+
11+
func MustCompile(str string) EnryRegexp {
12+
return rubex.MustCompile(str)
13+
}
14+
15+
func QuoteMeta(s string) string {
16+
return rubex.QuoteMeta(s)
17+
}

regex/standard.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
// +build !oniguruma
2+
3+
package regex
4+
5+
import (
6+
"regexp"
7+
)
8+
9+
type EnryRegexp = *regexp.Regexp
10+
11+
func MustCompile(str string) EnryRegexp {
12+
return regexp.MustCompile(str)
13+
}
14+
15+
func QuoteMeta(s string) string {
16+
return regexp.QuoteMeta(s)
17+
}

0 commit comments

Comments
 (0)