From cd9bfa2d9fd39c60861aa36a421700177910d094 Mon Sep 17 00:00:00 2001
From: Nick Doiron
Date: Thu, 26 May 2016 16:44:55 +0700
Subject: [PATCH] Scan CSS style tags and stylesheet links

---
 protocol/http_scanner.go    | 12 ++++++++++--
 scans/scanner.go            |  1 +
 scans/standard-page-scan.go | 27 ++++++++++++++++++++-------
 utils/url_parsing.go        | 17 ++++++++++++++++-
 4 files changed, 47 insertions(+), 10 deletions(-)

diff --git a/protocol/http_scanner.go b/protocol/http_scanner.go
index 1d9c0ae..29396e4 100644
--- a/protocol/http_scanner.go
+++ b/protocol/http_scanner.go
@@ -76,15 +76,23 @@ func (hps *HTTPProtocolScanner) ScanProtocol(hiddenService string, onionscanConf
 }
 
 func (hps *HTTPProtocolScanner) ScanPage(hiddenService string, page string, report *report.OnionScanReport, f func(scans.Scanner, string, int, string, *report.OnionScanReport)) {
+	_, contents, responseCode := hps.ScrapePage(hiddenService, page)
+	f(hps, page, responseCode, string(contents), report)
+}
+
+func (hps *HTTPProtocolScanner) ScrapePage(hiddenService string, page string) (error, []byte, int) {
 	if !strings.Contains(page, utils.WithoutSubdomains(hiddenService)) {
+		if !strings.HasPrefix(page, "/") {
+			page = "/" + page
+		}
 		page = hiddenService + page
 	}
 	response, err := hps.Client.Get("http://" + page)
 	if err != nil {
 		log.Printf("Error connecting to http://%s %s\n", page, err)
-		return
+		return err, nil, -1
 	}
 	defer response.Body.Close()
 	contents, _ := ioutil.ReadAll(response.Body)
-	f(hps, page, response.StatusCode, string(contents), report)
+	return nil, contents, response.StatusCode
 }
diff --git a/scans/scanner.go b/scans/scanner.go
index 61c7e48..3c38a13 100644
--- a/scans/scanner.go
+++ b/scans/scanner.go
@@ -6,4 +6,5 @@ import (
 
 type Scanner interface {
 	ScanPage(string, string, *report.OnionScanReport, func(Scanner, string, int, string, *report.OnionScanReport))
+	ScrapePage(string, string) (error, []byte, int)
 }
diff --git a/scans/standard-page-scan.go b/scans/standard-page-scan.go
index 35de25c..3679e87 100644
--- a/scans/standard-page-scan.go
+++ b/scans/standard-page-scan.go
@@ -34,7 +34,8 @@ func StandardPageScan(scan Scanner, page string, status int, contents string, re
 	new(PGPContentScan).ScanContent(contents, report)
 
 	log.Printf("\tScanning for Images\n")
-	domains := utils.ExtractDomains(contents)
+	var domains []string
+	var cssLinks []string
 
 	// parser based on http://schier.co/blog/2015/04/26/a-simple-web-scraper-in-go.html
 	z := html.NewTokenizer(strings.NewReader(contents))
@@ -49,8 +50,8 @@ func StandardPageScan(scan Scanner, page string, status int, contents string, re
 
 		// TODO: don't crawl links with nofollow
 		if tt == html.StartTagToken {
-			isLink := t.Data == "a"
-			if isLink {
+			// links
+			if t.Data == "a" {
 				linkUrl := utils.GetAttribute(t, "href")
 				if len(linkUrl) > 1 {
 					domains = append(domains, linkUrl)
@@ -58,8 +59,13 @@ func StandardPageScan(scan Scanner, page string, status int, contents string, re
 				}
 			}
 
-			isImage := t.Data == "img"
-			if isImage {
+			// css
+			if t.Data == "link" && utils.GetAttribute(t, "rel") == "stylesheet" {
+				cssLinks = append(cssLinks, utils.GetAttribute(t, "href"))
+			}
+
+			// images
+			if t.Data == "img" {
 				imageUrl := utils.GetAttribute(t, "src")
 				baseUrl, _ := url.Parse(imageUrl)
 
@@ -72,8 +78,15 @@ func StandardPageScan(scan Scanner, page string, status int, contents string, re
 			}
 		}
 	}
 
-	log.Printf("\tScanning for Links\n")
+	log.Printf("\tScanning for CSS Fonts and Background Images\n")
+	for _, cssUrl := range cssLinks {
+		log.Printf("\tScanning CSS file: %s\n", cssUrl)
+		_, cssContents, _ := scan.ScrapePage(report.HiddenService, utils.WithoutProtocol(cssUrl))
+		domains = append(domains, utils.ExtractDomains(string(cssContents))[0:]...)
+	}
+
+	log.Printf("\tScanning for Links\n")
+	domains = append(domains, utils.ExtractDomains(contents)...)
 	for _, domain := range domains {
 		baseUrl, _ := url.Parse(domain)
 		if baseUrl.Host != "" && utils.WithoutSubdomains(baseUrl.Host) != utils.WithoutSubdomains(report.HiddenService) {
@@ -95,7 +108,7 @@ func StandardPageScan(scan Scanner, page string, status int, contents string, re
 	foundPaths := r.FindAllStringSubmatch(string(contents), -1)
 	for _, regexpResults := range foundPaths {
 		path := regexpResults[2]
-		if strings.HasPrefix(path, "http") && !strings.Contains(path, utils.WithoutSubdomains(report.HiddenService)) {
+		if (strings.HasPrefix(path, "http") || strings.HasPrefix(path, "//")) && !strings.Contains(path, utils.WithoutSubdomains(report.HiddenService)) {
 			continue
 		}
 
diff --git a/utils/url_parsing.go b/utils/url_parsing.go
index 3765939..4c40876 100644
--- a/utils/url_parsing.go
+++ b/utils/url_parsing.go
@@ -2,11 +2,23 @@ package utils
 
 import (
 	"github.com/mvdan/xurls"
+	"regexp"
 	"strings"
 )
 
 func ExtractDomains(content string) []string {
-	return xurls.Strict.FindAllString(content, -1)
+	domains := xurls.Strict.FindAllString(content, -1)
+	cssurlregex := regexp.MustCompile(`(?i)url\((.*?)\)`)
+	cssDomains := cssurlregex.FindAllString(content, -1)
+	for _, cssDomain := range cssDomains {
+		if strings.HasPrefix(strings.ToLower(cssDomain), "url(") {
+			cssDomain = cssDomain[4 : len(cssDomain)-1]
+		}
+		if !strings.HasSuffix(cssDomain, ":before") && !strings.HasSuffix(cssDomain, ":after") {
+			domains = append(domains, cssDomain)
+		}
+	}
+	return domains
 }
 
 func WithoutSubdomains(urlhost string) string {
@@ -25,5 +37,8 @@ func WithoutProtocol(url string) string {
 	if strings.HasPrefix(url, "https://") {
 		return url[8:]
 	}
+	if strings.HasPrefix(url, "//") {
+		return url[2:]
+	}
 	return url
 }
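
Notes for reviewers, with small standalone sketches. All sketches are Go and runnable as-is; any name not taken from the patch (sample pages, helper names, service names) is invented for illustration.

The stylesheet detection rides along in the same tokenizer loop that already handles a and img tags. A minimal sketch of just that check, with getAttribute as a stand-in for utils.GetAttribute:

package main

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

// getAttribute is a stand-in for utils.GetAttribute: it returns the value of
// the named attribute on a token, or "" when the attribute is absent.
func getAttribute(t html.Token, name string) string {
	for _, a := range t.Attr {
		if a.Key == name {
			return a.Val
		}
	}
	return ""
}

func main() {
	page := `<html><head>
	<link rel="stylesheet" href="/css/main.css">
	<link rel="icon" href="/favicon.ico">
	</head><body><a href="/about">about</a></body></html>`

	var cssLinks []string
	z := html.NewTokenizer(strings.NewReader(page))
	for {
		tt := z.Next()
		if tt == html.ErrorToken {
			break // end of document
		}
		t := z.Token()
		// the same test the patch adds inside StandardPageScan
		if tt == html.StartTagToken && t.Data == "link" && getAttribute(t, "rel") == "stylesheet" {
			cssLinks = append(cssLinks, getAttribute(t, "href"))
		}
	}
	fmt.Println(cssLinks) // [/css/main.css]
}

One quirk worth knowing: the x/net/html tokenizer reports an XHTML-style self-closing tag such as <link rel="stylesheet" href="..." /> as html.SelfClosingTagToken rather than html.StartTagToken, so such links slip past this check.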
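
The url(...) scraping added to utils.ExtractDomains can be exercised on its own in the same way. A sketch with the xurls pass omitted; extractCSSURLs is an invented name:

package main

import (
	"fmt"
	"regexp"
	"strings"
)

// extractCSSURLs repeats the url(...) handling the patch adds to
// utils.ExtractDomains: find url(...) tokens case-insensitively, strip the
// url( ... ) wrapper, and drop matches ending in :before or :after.
func extractCSSURLs(content string) []string {
	var urls []string
	cssurlregex := regexp.MustCompile(`(?i)url\((.*?)\)`)
	for _, m := range cssurlregex.FindAllString(content, -1) {
		if strings.HasPrefix(strings.ToLower(m), "url(") {
			m = m[4 : len(m)-1]
		}
		if !strings.HasSuffix(m, ":before") && !strings.HasSuffix(m, ":after") {
			urls = append(urls, m)
		}
	}
	return urls
}

func main() {
	css := `body { background: url(//fonts.example.onion/bg.png); }
@font-face { src: url("fonts/custom.woff"); }`
	fmt.Println(extractCSSURLs(css))
	// [//fonts.example.onion/bg.png "fonts/custom.woff"]
}

Note that quotes inside url("...") survive into the result, as in the second match above. That is harmless for the host check in StandardPageScan: url.Parse on a quoted relative path yields an empty Host, so the entry is skipped, while //fonts.example.onion/bg.png parses with Host set and is compared against the hidden service.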
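
Finally, the protocol-relative handling: stylesheet hrefs like //host/style.css now round-trip through utils.WithoutProtocol and the path fix-up at the top of the new ScrapePage. A condensed sketch of both helpers, with the WithoutSubdomains call elided and a made-up service name:

package main

import (
	"fmt"
	"strings"
)

// withoutProtocol mirrors the updated utils.WithoutProtocol: it now strips
// the scheme-relative "//" prefix as well as http:// and https://.
func withoutProtocol(url string) string {
	if strings.HasPrefix(url, "http://") {
		return url[7:]
	}
	if strings.HasPrefix(url, "https://") {
		return url[8:]
	}
	if strings.HasPrefix(url, "//") {
		return url[2:]
	}
	return url
}

// rerootPage mirrors ScrapePage's path fix-up: a page that does not already
// mention the hidden service is rooted onto it, gaining a leading "/" first
// when it lacks one.
func rerootPage(hiddenService, page string) string {
	if !strings.Contains(page, hiddenService) {
		if !strings.HasPrefix(page, "/") {
			page = "/" + page
		}
		page = hiddenService + page
	}
	return page
}

func main() {
	const hiddenService = "example1234567.onion" // made-up service name
	for _, href := range []string{
		"/css/main.css",
		"css/main.css",
		"//example1234567.onion/css/main.css",
	} {
		fmt.Println("http://" + rerootPage(hiddenService, withoutProtocol(href)))
	}
	// all three resolve to http://example1234567.onion/css/main.css
}

The same reasoning motivates the regex-path change in StandardPageScan: a path beginning with "//" is treated like an absolute http(s) URL, so off-service protocol-relative references are skipped rather than crawled.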