Skip to content

Commit 64e59e3

Browse files
authored
Implementation of Git repository support for the Malicious Packages repository. (#1047)
* Commit WIP for git-repo support Signed-off-by: Caleb Brown <[email protected]> * Implement initial git-based repository support. Signed-off-by: Caleb Brown <[email protected]> * Improve documentation and shrink the gitname API. Signed-off-by: Caleb Brown <[email protected]> * Fix lint errors Signed-off-by: Caleb Brown <[email protected]> * Fix the nolint lint warning Signed-off-by: Caleb Brown <[email protected]> * Improve the git-based report testing and fix some edge cases. Signed-off-by: Caleb Brown <[email protected]> * Add initial doc changes for Git repo implementation. Signed-off-by: Caleb Brown <[email protected]> * Minor readme change. Signed-off-by: Caleb Brown <[email protected]> * Move git support doc changes out to another PR. This allows doc changes to be iterated on differently. Signed-off-by: Caleb Brown <[email protected]> * Allow versions for git repos. Require versions OR ranges otherwise. Also adds tests to ensure the new validation rules work as expected. Signed-off-by: Caleb Brown <[email protected]> * Respond to code review comments. Signed-off-by: Caleb Brown <[email protected]> --------- Signed-off-by: Caleb Brown <[email protected]>
1 parent e8c8ce2 commit 64e59e3

File tree

9 files changed

+764
-27
lines changed

9 files changed

+764
-27
lines changed

internal/gitname/canon.go

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
package gitname
2+
3+
import (
4+
"net/url"
5+
"strings"
6+
)
7+
8+
// Canon will canonicalize the a git repository name as returned by Parse.
9+
//
10+
// The method adjusts it's behaviour based on the hostname to ensure that
11+
// URLs are correctly canonicalized for each git hosting provider.
12+
//
13+
// Any password present in the URL will be stripped.
14+
func Canon(name *url.URL) *url.URL {
15+
u := *name // shallow copy
16+
17+
u.Host = strings.ToLower(u.Host)
18+
19+
// We can only adjust the hosts we are aware of.
20+
if handler := handlerForHost(u.Host); handler != nil {
21+
handler.Canon(&u)
22+
}
23+
24+
// Always strip passwords if they are present.
25+
if _, ok := u.User.Password(); ok {
26+
u.User = url.User(u.User.Username())
27+
}
28+
29+
return &u
30+
}
31+
32+
// CanonForStorage canonicalizes the git repository name and ensures that it
33+
// is nicely formatted for use in output.
34+
//
35+
// The scheme is dropped from the URL.
36+
// If the username is "git" it is dropped.
37+
// The ".git" suffix is removed.
38+
//
39+
// If the repository name is invalid, the string is returned without changes.
40+
func CanonForStorage(name string) string {
41+
u, err := Parse(name)
42+
if err != nil {
43+
return name
44+
}
45+
46+
u = Canon(u)
47+
48+
u.Scheme = ""
49+
50+
if u.User.Username() == "git" {
51+
u.User = nil
52+
}
53+
54+
u.Path, _ = strings.CutSuffix(u.Path, ".git")
55+
56+
canon := u.String()
57+
canon = canon[2:] // Strip "//" prefix
58+
return canon
59+
}

internal/gitname/canon_test.go

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
package gitname_test
2+
3+
import (
4+
"net/url"
5+
"testing"
6+
7+
"github.com/ossf/malicious-packages/internal/gitname"
8+
)
9+
10+
func TestCanon(t *testing.T) {
11+
tests := map[string]string{
12+
"https://github.com/org/repo.git": "https://github.com/org/repo.git",
13+
"https://github.com/org/repo": "https://github.com/org/repo.git",
14+
"http://github.com/org/repo.git": "https://github.com/org/repo.git",
15+
"ssh://[email protected]/Org/Repo.git": "https://github.com/org/repo.git",
16+
"ssh://[email protected]/org/REPO.git": "https://gitlab.com/org/repo.git",
17+
"https://go.googlesource.com/go": "https://go.googlesource.com/go",
18+
"https://gitee.com/ignOre/CASE.git": "https://gitee.com/ignore/CASE.git",
19+
"git://user:[email protected]/repo.git": "git://[email protected]/repo.git",
20+
}
21+
for repo, want := range tests {
22+
t.Run(repo, func(t *testing.T) {
23+
u, err := url.Parse(repo)
24+
if err != nil {
25+
t.Fatalf("url.Parse() = %v; want no error", err)
26+
}
27+
got := gitname.Canon(u).String()
28+
if got != want {
29+
t.Fatalf("Canon() = %q; want %q", got, want)
30+
}
31+
})
32+
}
33+
}
34+
35+
func TestCanonForStorage(t *testing.T) {
36+
tests := map[string]string{
37+
"invalid:": "invalid:",
38+
"invalid": "invalid",
39+
"ftp://invalid.com/repo.git": "ftp://invalid.com/repo.git",
40+
"https://github.com/org/repo.git": "github.com/org/repo",
41+
"[email protected]:org/repo.git": "github.com/org/repo",
42+
"ssh://[email protected]/Org/Repo.git": "github.com/org/repo",
43+
"https://go.googlesource.com/go": "go.googlesource.com/go",
44+
"[email protected]:path/to/repo.git": "[email protected]/path/to/repo",
45+
"[email protected]:ignOre/CASE.git": "gitee.com/ignore/CASE",
46+
"[email protected]:repo.git": "example.com/repo",
47+
}
48+
for repo, want := range tests {
49+
t.Run(repo, func(t *testing.T) {
50+
got := gitname.CanonForStorage(repo)
51+
if got != want {
52+
t.Fatalf("CanonForStorage() = %q; want %q", got, want)
53+
}
54+
})
55+
}
56+
}

internal/gitname/hosts.go

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
package gitname
2+
3+
import (
4+
"net/url"
5+
"strings"
6+
)
7+
8+
// gitHostHandler describes how repository URLs for one git hosting service
// are validated and canonicalized.
type gitHostHandler struct {
	// CheckPath reports whether a URL path is acceptable for the host.
	CheckPath func(string) bool
	// CanonScheme, when non-empty, replaces the scheme of the URL.
	CanonScheme string
	// CanonPath maps a URL path to its canonical form. A nil CanonPath
	// leaves the path unchanged.
	CanonPath func(string) string
	// KeepUser controls whether the user info is kept (true) or stripped
	// (false) during canonicalization.
	KeepUser bool
	// EnsureGitExt causes ".git" to be appended to any path lacking it.
	EnsureGitExt bool
}

// Canon canonicalizes the supplied url in place for a specific git hosting
// service, based on the configuration in the gitHostHandler.
func (h *gitHostHandler) Canon(u *url.URL) {
	// Replace the scheme if we have an override.
	if h.CanonScheme != "" {
		u.Scheme = h.CanonScheme
	}

	// Fix the path. A handler without a CanonPath keeps the path as it is
	// rather than panicking on a nil function value.
	if h.CanonPath != nil {
		u.Path = h.CanonPath(u.Path)
	}

	// Strip the user if we should not keep it.
	if !h.KeepUser {
		u.User = nil
	}

	// Ensure the .git extension is always present.
	if h.EnsureGitExt && !strings.HasSuffix(u.Path, ".git") {
		u.Path += ".git"
	}
}
37+
38+
// defaultGitHost covers common git hosting services that have a url structure
39+
// of "example.com/org/repo.git", where "org" and "repo" are case-insensitive.
40+
var defaultGitHost = &gitHostHandler{
41+
CheckPath: checkOrgRepoPath,
42+
CanonScheme: "https",
43+
CanonPath: strings.ToLower,
44+
KeepUser: false,
45+
EnsureGitExt: true,
46+
}
47+
48+
// sensitiveRepoGitHost is similar to defaultGitHost, except it preserves the
49+
// case on the "repo" part of the URL.
50+
var sensitiveRepoGitHost = &gitHostHandler{
51+
CheckPath: checkOrgRepoPath,
52+
CanonScheme: "https",
53+
CanonPath: canonLowerOrgPath,
54+
KeepUser: false,
55+
EnsureGitExt: true,
56+
}
57+
58+
// googlesourceGitHost is specifically for .googlesource.com git repositories.
59+
var googlesourceGitHost = &gitHostHandler{
60+
CheckPath: checkRepoOnlyPath,
61+
CanonScheme: "https",
62+
CanonPath: strings.ToLower,
63+
KeepUser: false,
64+
EnsureGitExt: false,
65+
}
66+
67+
// gitHosts maps either entire host matches or host suffixes to a gitHostHandler
68+
// instance.
69+
// Any key starting with a "." will be checked as a suffix. The order the
70+
// suffixes are checked is random.
71+
var gitHosts = map[string]*gitHostHandler{
72+
".googlesource.com": googlesourceGitHost,
73+
"github.com": defaultGitHost,
74+
"gitlab.com": defaultGitHost,
75+
"bitbucket.org": defaultGitHost,
76+
"codeberg.org": defaultGitHost,
77+
"gitee.com": sensitiveRepoGitHost,
78+
"gitee.cn": sensitiveRepoGitHost,
79+
}
80+
81+
func handlerForHost(host string) *gitHostHandler {
82+
if handler, ok := gitHosts[host]; ok {
83+
// There is a direct match, so return the handler immediately.
84+
return handler
85+
}
86+
for suffix, handler := range gitHosts {
87+
if suffix[0] != '.' {
88+
// The suffix must start with a "." to ensure subdomains are
89+
// matched correctly.
90+
continue
91+
}
92+
if strings.HasSuffix(host, suffix) {
93+
// The suffix matches the given host, so return the handler.
94+
return handler
95+
}
96+
}
97+
return nil
98+
}
99+
100+
// checkRepoOnlyPath ensures that the path being supplied only has one path
101+
// component.
102+
func checkRepoOnlyPath(path string) bool {
103+
return checkPathParts(path, 1)
104+
}
105+
106+
// checkOrgRepoPath ensures that the path being supplied only has two path
107+
// components.
108+
func checkOrgRepoPath(path string) bool {
109+
return checkPathParts(path, 2)
110+
}
111+
112+
// checkPathParts reports whether path contains exactly count components
// separated by "/". Leading slashes are ignored, and every component must be
// non-empty, so trailing or doubled slashes make the path invalid.
func checkPathParts(path string, count int) bool {
	remaining := strings.TrimLeft(path, "/")
	seen := 0
	for {
		part, rest, more := strings.Cut(remaining, "/")
		if part == "" {
			// An empty component can never be valid.
			return false
		}
		seen++
		if !more {
			break
		}
		remaining = rest
	}
	return seen == count
}
127+
128+
// canonLowerOrgPath lowercases the first non-empty component of the supplied
// path and leaves the rest of the path, including any leading slashes,
// untouched.
func canonLowerOrgPath(path string) string {
	// Leading slashes produce only empty components, so skip past them.
	start := len(path) - len(strings.TrimLeft(path, "/"))
	if start == len(path) {
		// There is no non-empty component to lowercase.
		return path
	}

	// The component ends at the next slash, or at the end of the path.
	end := len(path)
	if i := strings.IndexByte(path[start:], '/'); i >= 0 {
		end = start + i
	}

	return path[:start] + strings.ToLower(path[start:end]) + path[end:]
}

internal/gitname/parse.go

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
package gitname
2+
3+
import (
4+
"errors"
5+
"fmt"
6+
"net/url"
7+
"slices"
8+
"strings"
9+
)
10+
11+
var (
	// ErrInvalidGitRepo is wrapped by any errors returned by Parse.
	ErrInvalidGitRepo = errors.New("invalid git repository")

	// validGitRemoteSchemes lists the URL schemes accepted for git
	// repositories given in URL form.
	validGitRemoteSchemes = []string{"http", "https", "ssh", "git"}
)
20+
21+
// Parse parses a git repository name into a url.URL. If the name cannot be
22+
// parsed an error will be returned, and the url will be nil.
23+
//
24+
// Both URL and SCP-like git repository names are supported.
25+
func Parse(name string) (*url.URL, error) {
26+
u, err := url.Parse(name)
27+
if err == nil {
28+
// Apply some further validation to the parsed URL for of the repo.
29+
err = validateURLRepo(u)
30+
}
31+
if err != nil {
32+
// Assume if we still have an error we *might* have an SSH-based repo.
33+
u, err = parseSSH(name)
34+
if err != nil {
35+
return nil, err
36+
}
37+
}
38+
if handler := handlerForHost(u.Host); handler != nil && !handler.CheckPath(u.Path) {
39+
return nil, fmt.Errorf("%w: invalid path for host %q", ErrInvalidGitRepo, u.Host)
40+
}
41+
return u, nil
42+
}
43+
44+
func parseSSH(name string) (*url.URL, error) {
45+
// Hunt for the end of an IPv6 address first, to avoid matching the colons
46+
// in the IPv6 path itself.
47+
ipv6End := strings.Index(name, "]:")
48+
pathIdx := 0
49+
50+
if ipv6End >= 0 {
51+
// Skip the separator "]:"
52+
pathIdx = ipv6End + 2
53+
} else {
54+
i := strings.Index(name, ":")
55+
if i < 0 {
56+
return nil, fmt.Errorf("%w: no path separator", ErrInvalidGitRepo)
57+
}
58+
// Skip the separator ":"
59+
pathIdx = i + 1
60+
}
61+
62+
path := name[pathIdx:]
63+
if len(path) == 0 {
64+
return nil, fmt.Errorf("%w: empty path", ErrInvalidGitRepo)
65+
} else if path[0] == '/' {
66+
return nil, fmt.Errorf("%w: absolute path", ErrInvalidGitRepo)
67+
}
68+
// TODO: should we force a ".git" suffix?
69+
70+
userHost := name[0 : pathIdx-1]
71+
if len(userHost) == 0 {
72+
return nil, fmt.Errorf("%w: no user or host", ErrInvalidGitRepo)
73+
}
74+
75+
// Build a raw URL string that we parse later from the components of the
76+
// Git scp-like repository.
77+
raw := "ssh://"
78+
79+
userEnd := strings.LastIndex(userHost, "@")
80+
switch {
81+
case userEnd == 0:
82+
return nil, fmt.Errorf("%w: empty user", ErrInvalidGitRepo)
83+
case userEnd == len(userHost)-1:
84+
return nil, fmt.Errorf("%w: empty host", ErrInvalidGitRepo)
85+
default:
86+
raw += userHost
87+
}
88+
89+
raw += "/" + path
90+
u, err := url.Parse(raw)
91+
if err != nil {
92+
return nil, fmt.Errorf("%w: %w", ErrInvalidGitRepo, err)
93+
}
94+
95+
return u, nil
96+
}
97+
98+
func validateURLRepo(u *url.URL) error {
99+
if !slices.Contains(validGitRemoteSchemes, u.Scheme) {
100+
return fmt.Errorf("%w: unsupported git scheme", ErrInvalidGitRepo)
101+
}
102+
if u.Host == "" {
103+
return fmt.Errorf("%w: empty host", ErrInvalidGitRepo)
104+
}
105+
if u.Path == "" || u.Path == "/" {
106+
return fmt.Errorf("%w: empty path", ErrInvalidGitRepo)
107+
}
108+
return nil
109+
}

0 commit comments

Comments
 (0)