Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions runner/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@ type Options struct {
OutputMatchContentLength string
OutputFilterStatusCode string
OutputFilterErrorPage bool
FilterOutDuplicates bool
OutputFilterContentLength string
InputRawRequest string
rawRequest string
Expand Down Expand Up @@ -409,6 +410,7 @@ func ParseOptions() *Options {
flagSet.CreateGroup("filters", "Filters",
flagSet.StringVarP(&options.OutputFilterStatusCode, "filter-code", "fc", "", "filter response with specified status code (-fc 403,401)"),
flagSet.BoolVarP(&options.OutputFilterErrorPage, "filter-error-page", "fep", false, "filter response with ML based error page detection"),
flagSet.BoolVarP(&options.FilterOutDuplicates, "filter-duplicates", "fd", false, "filter out near-duplicate responses (only first response is retained)"),
flagSet.StringVarP(&options.OutputFilterContentLength, "filter-length", "fl", "", "filter response with specified content length (-fl 23,33)"),
flagSet.StringVarP(&options.OutputFilterLinesCount, "filter-line-count", "flc", "", "filter response body with specified line count (-flc 423,532)"),
flagSet.StringVarP(&options.OutputFilterWordsCount, "filter-word-count", "fwc", "", "filter response body with specified word count (-fwc 423,532)"),
Expand Down
46 changes: 38 additions & 8 deletions runner/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (

"github.com/PuerkitoBio/goquery"
"github.com/corona10/goimagehash"
"github.com/mfonda/simhash"
asnmap "github.com/projectdiscovery/asnmap/libs"
"github.com/projectdiscovery/fastdialer/fastdialer"
"github.com/projectdiscovery/httpx/common/customextract"
Expand Down Expand Up @@ -65,6 +66,7 @@ import (
"github.com/projectdiscovery/httpx/common/stringz"
"github.com/projectdiscovery/mapcidr"
"github.com/projectdiscovery/rawhttp"
converstionutil "github.com/projectdiscovery/utils/conversion"
fileutil "github.com/projectdiscovery/utils/file"
pdhttputil "github.com/projectdiscovery/utils/http"
iputil "github.com/projectdiscovery/utils/ip"
Expand All @@ -84,8 +86,9 @@ type Runner struct {
ratelimiter ratelimit.Limiter
HostErrorsCache gcache.Cache[string, int]
browser *Browser
pageTypeClassifier *pagetypeclassifier.PageTypeClassifier
pageTypeClassifier *pagetypeclassifier.PageTypeClassifier // Include this for general page classification
pHashClusters []pHashCluster
simHashes gcache.Cache[uint64, struct{}] // Include simHashes for efficient duplicate detection
httpApiEndpoint *Server
}

Expand Down Expand Up @@ -358,6 +361,7 @@ func New(options *Options) (*Runner, error) {
runner.HostErrorsCache = gc
}

runner.simHashes = gcache.New[uint64, struct{}](1000).ARC().Build()
runner.pageTypeClassifier = pagetypeclassifier.New()

if options.HttpApiEndpoint != "" {
Expand Down Expand Up @@ -438,7 +442,7 @@ func (r *Runner) prepareInput() {
// check if input target host(s) have been provided
if len(r.options.InputTargetHost) > 0 {
for _, target := range r.options.InputTargetHost {
expandedTarget := r.countTargetFromRawTarget(target)
expandedTarget, _ := r.countTargetFromRawTarget(target)
if expandedTarget > 0 {
numHosts += expandedTarget
r.hm.Set(target, nil) //nolint
Expand Down Expand Up @@ -514,6 +518,24 @@ func (r *Runner) seen(k string) bool {
return ok
}

// duplicate reports whether the raw response held by result has already been
// observed, either exactly or approximately, according to the fingerprints
// stored in r.simHashes.
//
// A simhash fingerprint is computed from the word features of result.Raw.
// The response counts as a duplicate when that fingerprint is already stored,
// or when simhash.Compare against any stored fingerprint yields a value of at
// most 3. When neither holds, the fingerprint is stored and false is returned,
// so only the first response of a group of similar responses is kept.
func (r *Runner) duplicate(result *Result) bool {
	// Largest simhash.Compare value still treated as a near-duplicate; kept
	// low to favour precision.
	const maxDistance = 3

	features := simhash.NewWordFeatureSet(converstionutil.Bytes(result.Raw))
	fingerprint := simhash.Simhash(features)

	// Fast path: the very same fingerprint was recorded earlier.
	if r.simHashes.Has(fingerprint) {
		gologger.Debug().Msgf("Skipping duplicate response with simhash %d for URL %s\n", fingerprint, result.URL)
		return true
	}

	// Slow path: compare against every fingerprint currently in the cache.
	known := r.simHashes.GetALL(false)
	for stored := range known {
		if simhash.Compare(stored, fingerprint) > maxDistance {
			continue
		}
		gologger.Debug().Msgf("Skipping near-duplicate response with simhash %d for URL %s\n", fingerprint, result.URL)
		return true
	}

	// First sighting: remember the fingerprint for later comparisons. The
	// error returned by Set is intentionally discarded.
	_ = r.simHashes.Set(fingerprint, struct{}{})
	return false
}

func (r *Runner) testAndSet(k string) bool {
// skip empty lines
k = strings.TrimSpace(k)
Expand Down Expand Up @@ -581,7 +603,7 @@ func (r *Runner) loadAndCloseFile(finput *os.File) (numTargets int, err error) {
for scanner.Scan() {
target := strings.TrimSpace(scanner.Text())
// Used just to get the exact number of targets
expandedTarget := r.countTargetFromRawTarget(target)
expandedTarget, _ := r.countTargetFromRawTarget(target)
if expandedTarget > 0 {
numTargets += expandedTarget
r.hm.Set(target, nil) //nolint
Expand All @@ -591,12 +613,12 @@ func (r *Runner) loadAndCloseFile(finput *os.File) (numTargets int, err error) {
return numTargets, err
}

func (r *Runner) countTargetFromRawTarget(rawTarget string) (numTargets int) {
func (r *Runner) countTargetFromRawTarget(rawTarget string) (numTargets int, err error) {
if rawTarget == "" {
return 0
return 0, nil
}
if _, ok := r.hm.Get(rawTarget); ok {
return 0
return 0, nil
}

expandedTarget := 0
Expand All @@ -606,14 +628,17 @@ func (r *Runner) countTargetFromRawTarget(rawTarget string) (numTargets int) {
expandedTarget = int(ipsCount)
}
case asn.IsASN(rawTarget):
cidrs, _ := asn.GetCIDRsForASNNum(rawTarget)
cidrs, err := asn.GetCIDRsForASNNum(rawTarget)
if err != nil {
return 0, err
}
for _, cidr := range cidrs {
expandedTarget += int(mapcidr.AddressCountIpnet(cidr))
}
default:
expandedTarget = 1
}
return expandedTarget
return expandedTarget, nil
}

var (
Expand Down Expand Up @@ -884,6 +909,11 @@ func (r *Runner) RunEnumeration() {
logFilteredErrorPage(r.options.OutputFilterErrorPagePath, resp.URL)
continue
}

if r.options.FilterOutDuplicates && r.duplicate(&resp) {
continue
}

if len(r.options.filterStatusCode) > 0 && sliceutil.Contains(r.options.filterStatusCode, resp.StatusCode) {
continue
}
Expand Down
37 changes: 27 additions & 10 deletions runner/runner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ import (

_ "github.com/projectdiscovery/fdmax/autofdmax"
"github.com/projectdiscovery/httpx/common/httpx"
"github.com/projectdiscovery/mapcidr/asn"
stringsutil "github.com/projectdiscovery/utils/strings"
"github.com/stretchr/testify/require"
)

Expand Down Expand Up @@ -106,11 +108,17 @@ func TestRunner_asn_targets(t *testing.T) {
for _, ip := range ips {
expected = append(expected, httpx.Target{Host: ip})
}

if _, err := asn.GetIPAddressesAsStream(input); err != nil && stringsutil.ContainsAnyI(err.Error(), "unauthorized: 401") {
t.Skip("skipping asn test due to missing/invalid api key")
return
}

got := []httpx.Target{}
for target := range r.targets(r.hp, input) {
got = append(got, target)
}
require.ElementsMatch(t, expected, got, "could not exepcted output")
require.ElementsMatch(t, expected, got, "could not get expected output")
}

func TestRunner_countTargetFromRawTarget(t *testing.T) {
Expand All @@ -120,32 +128,41 @@ func TestRunner_countTargetFromRawTarget(t *testing.T) {

input := "example.com"
expected := 1
got := r.countTargetFromRawTarget(input)
got, err := r.countTargetFromRawTarget(input)
require.Nil(t, err, "could not count targets")
require.Equal(t, expected, got, "got wrong output")

input = "example.com"
expected = 0
err = r.hm.Set(input, nil)
require.Nil(t, err, "could not set value to hm")
got = r.countTargetFromRawTarget(input)
got, err = r.countTargetFromRawTarget(input)
require.Nil(t, err, "could not count targets")
require.Equal(t, expected, got, "got wrong output")

input = "173.0.84.0/24"
expected = 256
got, err = r.countTargetFromRawTarget(input)
require.Nil(t, err, "could not count targets")
require.Equal(t, expected, got, "got wrong output")

input = ""
expected = 0
got = r.countTargetFromRawTarget(input)
got, err = r.countTargetFromRawTarget(input)
require.Nil(t, err, "could not count targets")
require.Equal(t, expected, got, "got wrong output")

if os.Getenv("PDCP_API_KEY") != "" {
input = "AS14421"
expected = 256
got = r.countTargetFromRawTarget(input)
got, err = r.countTargetFromRawTarget(input)
if err != nil && stringsutil.ContainsAnyI(err.Error(), "unauthorized: 401") {
t.Skip("skipping asn test due to missing/invalid api key")
return
}
require.Nil(t, err, "could not count targets")
require.Equal(t, expected, got, "got wrong output")
}

input = "173.0.84.0/24"
expected = 256
got = r.countTargetFromRawTarget(input)
require.Equal(t, expected, got, "got wrong output")
}

func TestRunner_urlWithComma_targets(t *testing.T) {
Expand Down