From 6c01292ec0312401a7e0714df119dbe2faf658d4 Mon Sep 17 00:00:00 2001 From: Nick Doiron Date: Tue, 24 May 2016 17:13:00 +0700 Subject: [PATCH] Find images and links by parsing HTML; support HTTPS and subdomains --- onionscan.go | 6 ++-- protocol/http_scanner.go | 7 ++-- report/onionscanreport.go | 6 ++++ scans/standard-page-scan.go | 66 ++++++++++++++++++++++++++++--------- utils/html_parsing.go | 12 +++++++ utils/url_parsing.go | 29 ++++++++++++++++ utils/useful_regexps.go | 7 ---- 7 files changed, 104 insertions(+), 29 deletions(-) create mode 100644 utils/html_parsing.go create mode 100644 utils/url_parsing.go delete mode 100644 utils/useful_regexps.go diff --git a/onionscan.go b/onionscan.go index b915a67..06bf2d8 100644 --- a/onionscan.go +++ b/onionscan.go @@ -6,6 +6,7 @@ import ( "github.com/s-rah/onionscan/config" "github.com/s-rah/onionscan/protocol" "github.com/s-rah/onionscan/report" + "github.com/s-rah/onionscan/utils" "strings" ) @@ -16,10 +17,7 @@ type OnionScan struct { func (os *OnionScan) Scan(hiddenService string) (*report.OnionScanReport, error) { // Remove Extra Prefix - // TODO: Add support for HTTPS? - if strings.HasPrefix(hiddenService, "http://") { - hiddenService = hiddenService[7:] - } + hiddenService = utils.WithoutProtocol(hiddenService) if strings.HasSuffix(hiddenService, "/") { hiddenService = hiddenService[0 : len(hiddenService)-1] diff --git a/protocol/http_scanner.go b/protocol/http_scanner.go index 0210680..1d9c0ae 100644 --- a/protocol/http_scanner.go +++ b/protocol/http_scanner.go @@ -76,9 +76,12 @@ func (hps *HTTPProtocolScanner) ScanProtocol(hiddenService string, onionscanConf } func (hps *HTTPProtocolScanner) ScanPage(hiddenService string, page string, report *report.OnionScanReport, f func(scans.Scanner, string, int, string, *report.OnionScanReport)) { - response, err := hps.Client.Get("http://" + hiddenService + page) + if !strings.Contains(page, utils.WithoutSubdomains(hiddenService)) { + page = hiddenService + page + } + response, err := hps.Client.Get("http://" + page) if err != nil { - log.Printf("Error connecting to %s%s %s\n", hiddenService, page, err) + log.Printf("Error connecting to http://%s %s\n", page, err) return } defer response.Body.Close() diff --git a/report/onionscanreport.go b/report/onionscanreport.go index ab18941..27f5097 100644 --- a/report/onionscanreport.go +++ b/report/onionscanreport.go @@ -33,6 +33,7 @@ type OnionScanReport struct { RelatedOnionServices []string `json:"relatedOnionServices"` RelatedClearnetDomains []string `json:"relatedOnionDomains"` LinkedSites []string `json:"linkedSites"` + InternalPages []string `json:"InternalPages"` IP []string `json:"ipAddresses"` OpenDirectories []string `json:"openDirectories"` ExifImages []ExifImage `json:"exifImages"` @@ -86,6 +87,11 @@ func (osr *OnionScanReport) AddLinkedSite(site string) { utils.RemoveDuplicates(&osr.LinkedSites) } +func (osr *OnionScanReport) AddInternalPage(site string) { + osr.InternalPages = append(osr.InternalPages, site) + utils.RemoveDuplicates(&osr.InternalPages) +} + func (osr *OnionScanReport) AddPGPKey(key string) { osr.PGPKeys = append(osr.PGPKeys, key) utils.RemoveDuplicates(&osr.PGPKeys) diff --git a/scans/standard-page-scan.go b/scans/standard-page-scan.go index 4ea9b4b..35de25c 100644 --- a/scans/standard-page-scan.go +++ b/scans/standard-page-scan.go @@ -5,6 +5,7 @@ import ( "encoding/hex" "github.com/s-rah/onionscan/report" "github.com/s-rah/onionscan/utils" + "golang.org/x/net/html" "log" "net/url" "regexp" @@ -12,9 +13,9 @@ import ( ) 
func StandardPageScan(scan Scanner, page string, status int, contents string, report *report.OnionScanReport) { - log.Printf("Scanning %s%s\n", report.HiddenService, page) + log.Printf("Scanning %s\n", page) if status == 200 { - log.Printf("\tPage %s%s is Accessible\n", report.HiddenService, page) + log.Printf("\tPage %s is Accessible\n", page) hash := sha1.Sum([]byte(contents)) report.Hashes = append(report.Hashes, hex.EncodeToString(hash[:])) @@ -31,44 +32,77 @@ func StandardPageScan(scan Scanner, page string, status int, contents string, re } new(PGPContentScan).ScanContent(contents, report) + + log.Printf("\tScanning for Images\n") domains := utils.ExtractDomains(contents) + // parser based on http://schier.co/blog/2015/04/26/a-simple-web-scraper-in-go.html + z := html.NewTokenizer(strings.NewReader(contents)) + for { + tt := z.Next() + if tt == html.ErrorToken { + break + } + t := z.Token() + + // check for an href and src attributes + // TODO: don't crawl links with nofollow + + if tt == html.StartTagToken { + isLink := t.Data == "a" + if isLink { + linkUrl := utils.GetAttribute(t, "href") + if len(linkUrl) > 1 { + domains = append(domains, linkUrl) + } + } + } + + isImage := t.Data == "img" + if isImage { + imageUrl := utils.GetAttribute(t, "src") + + baseUrl, _ := url.Parse(imageUrl) + if utils.WithoutSubdomains(baseUrl.Host) == utils.WithoutSubdomains(report.HiddenService) { + scan.ScanPage(report.HiddenService, utils.WithoutProtocol(imageUrl), report, CheckExif) + log.Printf("\t Found internal image %s\n", imageUrl) + } else { + log.Printf("\t Not scanning remote image %s\n", imageUrl) + } + } + } + + log.Printf("\tScanning for Links\n") + for _, domain := range domains { - if !strings.HasPrefix(domain, "http://"+report.HiddenService) { + baseUrl, _ := url.Parse(domain) + if baseUrl.Host != "" && utils.WithoutSubdomains(baseUrl.Host) != utils.WithoutSubdomains(report.HiddenService) { log.Printf("Found Related URL %s\n", domain) // TODO: Lots of information here which needs to be processed. // * Links to standard sites - google / bitpay etc. // * Links to other onion sites // * Links to obscure clearnet sites. 
- baseUrl, _ := url.Parse(domain) report.AddLinkedSite(baseUrl.Host) } else { - // * Process FQDN internal links (unlikly) + // * Process FQDN internal links log.Printf("Found Internal URL %s\n", domain) + report.AddInternalPage(baseUrl.Host) } } - log.Printf("\tScanning for Images\n") - r := regexp.MustCompile("src=\"(" + "http://" + report.HiddenService + "/)?((.*?\\.jpg)|(.*?\\.png)|(.*?\\.jpeg)|(.*?\\.gif))\"") - foundImages := r.FindAllStringSubmatch(string(contents), -1) - for _, image := range foundImages { - log.Printf("\t Found image %s\n", image[2]) - scan.ScanPage(report.HiddenService, "/"+image[2], report, CheckExif) - } - log.Printf("\tScanning for Referenced Directories\n") - r = regexp.MustCompile("(src|href)=\"([^\"]*)\"") + r := regexp.MustCompile("(src|href)=\"([^\"]*)\"") foundPaths := r.FindAllStringSubmatch(string(contents), -1) for _, regexpResults := range foundPaths { path := regexpResults[2] - if strings.HasPrefix(path, "http") { + if strings.HasPrefix(path, "http") && !strings.Contains(path, utils.WithoutSubdomains(report.HiddenService)) { continue } term := strings.LastIndex(path, "/") if term > 0 { log.Printf("\t Found Referenced Directory %s\n", path[:term]) - report.AddPageReferencedDirectory(path[:term]) + report.AddPageReferencedDirectory(utils.WithoutProtocol(path[:term])) } } } else if status == 403 { diff --git a/utils/html_parsing.go b/utils/html_parsing.go new file mode 100644 index 0000000..24acd42 --- /dev/null +++ b/utils/html_parsing.go @@ -0,0 +1,12 @@ +package utils + +import "golang.org/x/net/html" + +func GetAttribute(tag html.Token, name string) string { + for _, a := range tag.Attr { + if a.Key == name { + return a.Val + } + } + return "" +} diff --git a/utils/url_parsing.go b/utils/url_parsing.go new file mode 100644 index 0000000..3765939 --- /dev/null +++ b/utils/url_parsing.go @@ -0,0 +1,29 @@ +package utils + +import ( + "github.com/mvdan/xurls" + "strings" +) + +func ExtractDomains(content string) []string { + return xurls.Strict.FindAllString(content, -1) +} + +func WithoutSubdomains(urlhost string) string { + urlParts := strings.Split(urlhost, ".") + if len(urlParts) < 2 { + return "" + } else { + return strings.Join(urlParts[len(urlParts)-2:], ".") + } +} + +func WithoutProtocol(url string) string { + if strings.HasPrefix(url, "http://") { + return url[7:] + } + if strings.HasPrefix(url, "https://") { + return url[8:] + } + return url +} diff --git a/utils/useful_regexps.go b/utils/useful_regexps.go deleted file mode 100644 index 19a13d1..0000000 --- a/utils/useful_regexps.go +++ /dev/null @@ -1,7 +0,0 @@ -package utils - -import "github.com/mvdan/xurls" - -func ExtractDomains(content string) []string { - return xurls.Strict.FindAllString(content, -1) -}
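
Notes on the new utils helpers (illustration only; the .onion hostnames below are made up). WithoutProtocol and WithoutSubdomains do the normalisation that the HTTPS and subdomain handling above relies on. A minimal sketch of their behaviour, assuming the import path introduced by this patch:

    package main

    import (
        "fmt"

        "github.com/s-rah/onionscan/utils"
    )

    func main() {
        // WithoutProtocol strips an http:// or https:// prefix and leaves
        // anything else untouched.
        fmt.Println(utils.WithoutProtocol("https://examplexyz.onion/index.html"))
        // -> examplexyz.onion/index.html

        // WithoutSubdomains keeps only the last two labels of a host, so a
        // subdomain compares equal to its parent hidden service.
        fmt.Println(utils.WithoutSubdomains("cdn.examplexyz.onion")) // -> examplexyz.onion
        fmt.Println(utils.WithoutSubdomains("examplexyz.onion"))     // -> examplexyz.onion
        fmt.Println(utils.WithoutSubdomains("localhost"))            // -> "" (fewer than two labels)
    }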
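
The change to HTTPProtocolScanner.ScanPage means a page can now be either a relative path or a full protocol-less URL on a subdomain of the scanned service; anything that does not mention the service's base domain is treated as relative and prefixed. A sketch that mirrors that decision as a standalone function (resolvePage is a hypothetical name, not part of the patch):

    package main

    import (
        "fmt"
        "strings"

        "github.com/s-rah/onionscan/utils"
    )

    // resolvePage mirrors the prefixing logic added to ScanPage: pages already
    // on the service's base domain (including its subdomains) are fetched
    // as-is; everything else is joined onto the hidden service address.
    func resolvePage(hiddenService, page string) string {
        if !strings.Contains(page, utils.WithoutSubdomains(hiddenService)) {
            page = hiddenService + page
        }
        return "http://" + page
    }

    func main() {
        fmt.Println(resolvePage("examplexyz.onion", "/index.html"))
        // -> http://examplexyz.onion/index.html
        fmt.Println(resolvePage("examplexyz.onion", "cdn.examplexyz.onion/img.png"))
        // -> http://cdn.examplexyz.onion/img.png
    }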
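
Image and link discovery in scans/standard-page-scan.go is now driven by the golang.org/x/net/html tokenizer instead of regexps. A small self-contained sketch of the same tokenizer pattern together with the new GetAttribute helper (the HTML fragment and hostnames are invented for the example):

    package main

    import (
        "fmt"
        "strings"

        "golang.org/x/net/html"

        "github.com/s-rah/onionscan/utils"
    )

    func main() {
        const fragment = `<a href="http://otherxyz.onion/"><img src="/images/photo.jpg"></a>`
        z := html.NewTokenizer(strings.NewReader(fragment))
        for {
            if z.Next() == html.ErrorToken {
                break // io.EOF: end of the fragment
            }
            t := z.Token()
            // End tags carry no attributes, so GetAttribute returns "" for them.
            if t.Data == "a" {
                if href := utils.GetAttribute(t, "href"); href != "" {
                    fmt.Println("link:", href)
                }
            }
            if t.Data == "img" {
                if src := utils.GetAttribute(t, "src"); src != "" {
                    fmt.Println("image:", src)
                }
            }
        }
    }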