Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 59 additions & 0 deletions internal/gitname/canon.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
package gitname

import (
"net/url"
"strings"
)

// Canon returns a canonicalized copy of a git repository name as returned by
// Parse. The URL passed in is not modified.
//
// The host is lowercased, and when the host belongs to a git hosting provider
// we know about, that provider's rules are applied to the rest of the URL.
//
// Any password present in the URL is stripped.
func Canon(name *url.URL) *url.URL {
	// Work on a shallow copy so the caller's URL is left untouched.
	canon := new(url.URL)
	*canon = *name

	canon.Host = strings.ToLower(canon.Host)

	// Host-specific rules only exist for the providers we are aware of.
	handler := handlerForHost(canon.Host)
	if handler != nil {
		handler.Canon(canon)
	}

	// A password never survives canonicalization; only the username is kept.
	if _, hasPassword := canon.User.Password(); hasPassword {
		canon.User = url.User(canon.User.Username())
	}

	return canon
}

// CanonForStorage parses and canonicalizes a git repository name, then
// renders it in a compact form:
//
//   - the scheme is dropped;
//   - the username is dropped if it is "git";
//   - a trailing ".git" is removed from the path.
//
// If name cannot be parsed as a git repository, it is returned unchanged.
func CanonForStorage(name string) string {
	parsed, err := Parse(name)
	if err != nil {
		return name
	}

	stored := Canon(parsed)
	stored.Scheme = ""
	if stored.User.Username() == "git" {
		stored.User = nil
	}
	stored.Path = strings.TrimSuffix(stored.Path, ".git")

	// With the scheme cleared the URL renders with a leading "//"; skip it.
	return stored.String()[2:]
}
56 changes: 56 additions & 0 deletions internal/gitname/canon_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
package gitname_test

import (
"net/url"
"testing"

"github.com/ossf/malicious-packages/internal/gitname"
)

// TestCanon checks that Canon produces the expected canonical URL for known
// hosts, for an unknown host, and for a URL carrying a password.
//
// NOTE(review): the user@host portions of the ssh:// and git:// entries were
// reconstructed from the expected outputs. The password ("pass") and the
// "example.com" host in the last entry are placeholders; confirm against the
// original test data.
func TestCanon(t *testing.T) {
	// Maps the input URL to the string form of the URL Canon should return.
	tests := map[string]string{
		"https://github.com/org/repo.git":      "https://github.com/org/repo.git",
		"https://github.com/org/repo":          "https://github.com/org/repo.git",
		"http://github.com/org/repo.git":       "https://github.com/org/repo.git",
		"ssh://git@github.com/Org/Repo.git":    "https://github.com/org/repo.git",
		"ssh://git@gitlab.com/org/REPO.git":    "https://gitlab.com/org/repo.git",
		"https://go.googlesource.com/go":       "https://go.googlesource.com/go",
		"https://gitee.com/ignOre/CASE.git":    "https://gitee.com/ignore/CASE.git",
		"git://user:pass@example.com/repo.git": "git://user@example.com/repo.git",
	}
	for repo, want := range tests {
		t.Run(repo, func(t *testing.T) {
			u, err := url.Parse(repo)
			if err != nil {
				t.Fatalf("url.Parse() = %v; want no error", err)
			}
			got := gitname.Canon(u).String()
			if got != want {
				t.Fatalf("Canon() = %q; want %q", got, want)
			}
		})
	}
}

// TestCanonForStorage checks the storage form produced by CanonForStorage for
// invalid names (returned unchanged), URL-style names and SCP-like names.
//
// NOTE(review): the user@host portions of the SCP-like and ssh:// entries
// were reconstructed from the expected outputs. The "user@example.com" entry
// only needs a user other than "git" on a host without a handler; the exact
// values are placeholders to confirm against the original test data.
func TestCanonForStorage(t *testing.T) {
	// Maps the input repository name to the expected storage form.
	tests := map[string]string{
		"invalid:":                          "invalid:",
		"invalid":                           "invalid",
		"ftp://invalid.com/repo.git":        "ftp://invalid.com/repo.git",
		"https://github.com/org/repo.git":   "github.com/org/repo",
		"git@github.com:org/repo.git":       "github.com/org/repo",
		"ssh://git@github.com/Org/Repo.git": "github.com/org/repo",
		"https://go.googlesource.com/go":    "go.googlesource.com/go",
		"user@example.com:path/to/repo.git": "user@example.com/path/to/repo",
		"git@gitee.com:ignOre/CASE.git":     "gitee.com/ignore/CASE",
		"git@example.com:repo.git":          "example.com/repo",
	}
	for repo, want := range tests {
		t.Run(repo, func(t *testing.T) {
			got := gitname.CanonForStorage(repo)
			if got != want {
				t.Fatalf("CanonForStorage() = %q; want %q", got, want)
			}
		})
	}
}
141 changes: 141 additions & 0 deletions internal/gitname/hosts.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
package gitname

import (
"net/url"
"strings"
)

// gitHostHandler describes how repository URLs for one git hosting service
// are validated and canonicalized.
type gitHostHandler struct {
	// CheckPath reports whether a URL path has an acceptable shape for the
	// host.
	CheckPath func(string) bool
	// CanonScheme, if not empty, replaces the URL scheme.
	CanonScheme string
	// CanonPath maps a URL path to its canonical form. If nil, the path is
	// left unchanged.
	CanonPath func(string) string
	// KeepUser controls whether the user information in the URL is kept.
	KeepUser bool
	// EnsureGitExt controls whether ".git" is appended to a path that does
	// not already end with it.
	EnsureGitExt bool
}

// Canon canonicalizes the supplied url in place for a specific git hosting
// service, based on the configuration in the gitHostHandler.
func (h *gitHostHandler) Canon(u *url.URL) {
	// Replace the scheme if we have an override.
	if h.CanonScheme != "" {
		u.Scheme = h.CanonScheme
	}

	// Fix the path. A handler without a CanonPath leaves the path as it is
	// rather than panicking on a nil function.
	if h.CanonPath != nil {
		u.Path = h.CanonPath(u.Path)
	}

	// Strip the user if we should not keep it.
	if !h.KeepUser {
		u.User = nil
	}

	// Ensure the .git extension is always present. This runs after the path
	// has been canonicalized so the suffix check sees the final path.
	if h.EnsureGitExt && !strings.HasSuffix(u.Path, ".git") {
		u.Path += ".git"
	}
}

// defaultGitHost covers common git hosting services that have a url structure
// of "example.com/org/repo.git", where "org" and "repo" are case-insensitive.
//
// With this configuration the scheme is forced to "https", the whole path is
// lowercased, user information is dropped, and the path always ends in ".git".
var defaultGitHost = &gitHostHandler{
CheckPath: checkOrgRepoPath,
CanonScheme: "https",
CanonPath: strings.ToLower,
KeepUser: false,
EnsureGitExt: true,
}

// sensitiveRepoGitHost is similar to defaultGitHost, except it preserves the
// case on the "repo" part of the URL.
//
// Only the first path component (the "org") is lowercased, via
// canonLowerOrgPath; every other setting matches defaultGitHost.
var sensitiveRepoGitHost = &gitHostHandler{
CheckPath: checkOrgRepoPath,
CanonScheme: "https",
CanonPath: canonLowerOrgPath,
KeepUser: false,
EnsureGitExt: true,
}

// googlesourceGitHost is specifically for .googlesource.com git repositories.
//
// Paths must consist of a single component (checkRepoOnlyPath). The scheme is
// forced to "https", the path is lowercased, user information is dropped, and
// no ".git" suffix is added.
var googlesourceGitHost = &gitHostHandler{
CheckPath: checkRepoOnlyPath,
CanonScheme: "https",
CanonPath: strings.ToLower,
KeepUser: false,
EnsureGitExt: false,
}

// gitHosts maps either entire host matches or host suffixes to a gitHostHandler
// instance.
// Any key starting with a "." will be checked as a suffix. The order the
// suffixes are checked is random.
var gitHosts = map[string]*gitHostHandler{
".googlesource.com": googlesourceGitHost,
"github.com": defaultGitHost,
"gitlab.com": defaultGitHost,
"bitbucket.org": defaultGitHost,
"codeberg.org": defaultGitHost,
"gitee.com": sensitiveRepoGitHost,
"gitee.cn": sensitiveRepoGitHost,
}

func handlerForHost(host string) *gitHostHandler {
if handler, ok := gitHosts[host]; ok {
// There is a direct match, so return the handler immediately.
return handler
}
for suffix, handler := range gitHosts {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess iteration order will be random, if we ever have two suffix patterns that would match.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is true. I have added a comment above to make this clearer.

For now I am okay with this.

if suffix[0] != '.' {
// The suffix must start with a "." to ensure subdomains are
// matched correctly.
continue
}
if strings.HasSuffix(host, suffix) {
// The suffix matches the given host, so return the handler.
return handler
}
}
return nil
}

// checkRepoOnlyPath reports whether path consists of exactly one non-empty
// component, e.g. "/repo".
func checkRepoOnlyPath(path string) bool {
	const repoOnly = 1
	return checkPathParts(path, repoOnly)
}

// checkOrgRepoPath reports whether path consists of exactly two non-empty
// components, e.g. "/org/repo".
func checkOrgRepoPath(path string) bool {
	const orgAndRepo = 2
	return checkPathParts(path, orgAndRepo)
}

// checkPathParts reports whether path is made up of exactly count components
// separated by "/". Leading slashes are ignored. An empty component anywhere
// (including one caused by a trailing slash) makes the path invalid.
func checkPathParts(path string, count int) bool {
	rest := strings.TrimLeft(path, "/")
	seen := 0
	for {
		part, tail, more := strings.Cut(rest, "/")
		if part == "" {
			return false
		}
		seen++
		if !more {
			break
		}
		rest = tail
	}
	return seen == count
}

// canonLowerOrgPath lowercases the first non-empty component of path and
// leaves everything else, including any leading slashes, untouched.
func canonLowerOrgPath(path string) string {
	// Peel off the leading slashes so the first component starts the string.
	trimmed := strings.TrimLeft(path, "/")
	slashes := path[:len(path)-len(trimmed)]

	org, rest, hasRest := strings.Cut(trimmed, "/")
	if hasRest {
		// Put back the separator that Cut consumed.
		rest = "/" + rest
	}
	return slashes + strings.ToLower(org) + rest
}
109 changes: 109 additions & 0 deletions internal/gitname/parse.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
package gitname

import (
"errors"
"fmt"
"net/url"
"slices"
"strings"
)

// ErrInvalidGitRepo is wrapped by any errors returned by Parse, so callers
// can detect an invalid repository name with errors.Is.
var ErrInvalidGitRepo = errors.New("invalid git repository")

// validGitRemoteSchemes lists the URL schemes that validateURLRepo accepts
// for a URL-style git repository name.
var validGitRemoteSchemes = []string{
"http",
"https",
"ssh",
"git",
}

// Parse parses a git repository name into a url.URL. If the name cannot be
// parsed an error is returned and the url is nil.
//
// Both URL and SCP-like git repository names are supported. The name is first
// tried as a URL; if that fails, or the URL is not an acceptable git remote,
// it is tried as an SCP-like name. For hosts we have a handler for, the path
// must also pass that handler's path check.
func Parse(name string) (*url.URL, error) {
	repo, urlErr := url.Parse(name)
	if urlErr != nil || validateURLRepo(repo) != nil {
		// Not a usable URL, so we *might* have an SSH-based repo instead.
		var sshErr error
		if repo, sshErr = parseSSH(name); sshErr != nil {
			return nil, sshErr
		}
	}

	handler := handlerForHost(repo.Host)
	if handler == nil || handler.CheckPath(repo.Path) {
		return repo, nil
	}
	return nil, fmt.Errorf("%w: invalid path for host %q", ErrInvalidGitRepo, repo.Host)
}

// parseSSH parses an SCP-like git repository name of the form
// "[user@]host:path" into an "ssh://" URL.
//
// The path must be non-empty and must not start with "/". The part before the
// separator must be non-empty, and neither the user nor the host may be empty
// when an "@" is present. All errors wrap ErrInvalidGitRepo.
func parseSSH(name string) (*url.URL, error) {
	// Look for the end of a bracketed IPv6 address first, so the colons
	// inside the address are not mistaken for the path separator.
	sep := strings.Index(name, "]:")
	if sep >= 0 {
		// Step over "]" so sep points at the ":" separator itself.
		sep++
	} else if sep = strings.Index(name, ":"); sep < 0 {
		return nil, fmt.Errorf("%w: no path separator", ErrInvalidGitRepo)
	}

	userHost, path := name[:sep], name[sep+1:]

	if path == "" {
		return nil, fmt.Errorf("%w: empty path", ErrInvalidGitRepo)
	}
	if strings.HasPrefix(path, "/") {
		return nil, fmt.Errorf("%w: absolute path", ErrInvalidGitRepo)
	}
	// TODO: should we force a ".git" suffix?

	if userHost == "" {
		return nil, fmt.Errorf("%w: no user or host", ErrInvalidGitRepo)
	}

	// The last "@" splits the user from the host; neither side may be empty.
	if at := strings.LastIndex(userHost, "@"); at == 0 {
		return nil, fmt.Errorf("%w: empty user", ErrInvalidGitRepo)
	} else if at == len(userHost)-1 {
		return nil, fmt.Errorf("%w: empty host", ErrInvalidGitRepo)
	}

	// Rebuild the SCP-like name as an ssh:// URL and let net/url validate it.
	u, err := url.Parse("ssh://" + userHost + "/" + path)
	if err != nil {
		return nil, fmt.Errorf("%w: %w", ErrInvalidGitRepo, err)
	}

	return u, nil
}

// validateURLRepo checks that a parsed URL is usable as a git repository
// name: its scheme must be one of validGitRemoteSchemes, and it must have a
// host and a path other than "/". Any error returned wraps ErrInvalidGitRepo.
func validateURLRepo(u *url.URL) error {
	switch {
	case !slices.Contains(validGitRemoteSchemes, u.Scheme):
		return fmt.Errorf("%w: unsupported git scheme", ErrInvalidGitRepo)
	case u.Host == "":
		return fmt.Errorf("%w: empty host", ErrInvalidGitRepo)
	case u.Path == "", u.Path == "/":
		return fmt.Errorf("%w: empty path", ErrInvalidGitRepo)
	}
	return nil
}
Loading
Loading