From a0ae46ca31af9139062a1b94b296bbd99f6ef48f Mon Sep 17 00:00:00 2001
From: Sarah Jamie Lewis
Date: Tue, 12 Apr 2016 22:03:39 -0700
Subject: [PATCH] Extract all URLs during standard page scan. Currently not pulled through into reporting.

---
 README.md                   |  1 +
 scans/standard-page-scan.go | 16 ++++++++++++++++
 utils/useful_regexps.go     |  7 +++++++
 3 files changed, 24 insertions(+)
 create mode 100644 utils/useful_regexps.go

diff --git a/README.md b/README.md
index 2f7a8b4..d408064 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,7 @@ deanonymize.
 
 * h12.me/socks - For the Tor SOCKS Proxy connection.
 * github.com/xiam/exif - For EXIF data extraction.
+* github.com/mvdan/xurls - For some URL parsing.
 
 ## OS Package Dependencies
 
diff --git a/scans/standard-page-scan.go b/scans/standard-page-scan.go
index 13f8e30..dd20684 100644
--- a/scans/standard-page-scan.go
+++ b/scans/standard-page-scan.go
@@ -2,8 +2,10 @@ package scans
 
 import (
 	"github.com/s-rah/onionscan/report"
+	"github.com/s-rah/onionscan/utils"
 	"log"
 	"regexp"
+	"strings"
 )
 
 func StandardPageScan(scan Scanner, page string, status int, contents string, report *report.OnionScanReport) {
@@ -11,6 +13,20 @@ func StandardPageScan(scan Scanner, page string, status int, contents string, re
 	if status == 200 {
 		log.Printf("\tPage %s%s is Accessible\n", report.HiddenService, page)
 
+		domains := utils.ExtractDomains(contents)
+
+		for _,domain := range domains {
+			if !strings.HasPrefix(domain, "http://"+report.HiddenService) {
+				log.Printf("Found Related URL %s\n", domain)
+				// TODO: Lots of information here which needs to be processed.
+				// * Links to standard sites - google / bitpay etc.
+				// * Links to other onion sites
+				// * Links to obscure clearnet sites.
+			} else {
+				// * Process Internal links
+			}
+		}
+
 		log.Printf("\tScanning for Images\n")
 		r := regexp.MustCompile("src=\"(" + "http://" + report.HiddenService + "/)?((.*?\\.jpg)|(.*?\\.png)|(.*?\\.jpeg)|(.*?\\.gif))\"")
 		foundImages := r.FindAllStringSubmatch(string(contents), -1)
diff --git a/utils/useful_regexps.go b/utils/useful_regexps.go
new file mode 100644
index 0000000..19a13d1
--- /dev/null
+++ b/utils/useful_regexps.go
@@ -0,0 +1,7 @@
+package utils
+
+import "github.com/mvdan/xurls"
+
+func ExtractDomains(content string) []string {
+	return xurls.Strict.FindAllString(content, -1)
+}