Merge pull request #41 from mapmeld/master
Scan CSS style tags and stylesheet links
commit 49095c586a
@@ -76,15 +76,23 @@ func (hps *HTTPProtocolScanner) ScanProtocol(hiddenService string, onionscanConf
 }
 
 func (hps *HTTPProtocolScanner) ScanPage(hiddenService string, page string, report *report.OnionScanReport, f func(scans.Scanner, string, int, string, *report.OnionScanReport)) {
+	_, contents, responseCode := hps.ScrapePage(hiddenService, page)
+	f(hps, page, responseCode, string(contents), report)
+}
+
+func (hps *HTTPProtocolScanner) ScrapePage(hiddenService string, page string) (error, []byte, int) {
 	if !strings.Contains(page, utils.WithoutSubdomains(hiddenService)) {
 		if !strings.HasPrefix(page, "/") {
 			page = "/" + page
 		}
 		page = hiddenService + page
 	}
 	response, err := hps.Client.Get("http://" + page)
 	if err != nil {
 		log.Printf("Error connecting to http://%s %s\n", page, err)
-		return
+		return err, nil, -1
 	}
 	defer response.Body.Close()
 	contents, _ := ioutil.ReadAll(response.Body)
-	f(hps, page, response.StatusCode, string(contents), report)
+	return nil, contents, response.StatusCode
 }
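Note: as a standalone illustration (not part of this commit), the sketch below mirrors the shape of the new ScrapePage helper against plain net/http so it can run outside the scanner. The (error, []byte, int) return ordering and the -1 status on a failed request match the diff above; the scrapePage function name, the example host, and the simplified URL joining are assumptions for illustration.

package main

import (
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"strings"
)

// scrapePage is an illustrative stand-in for the new ScrapePage method:
// fetch a page and hand back (error, body, status), with -1 as the status
// when the request itself fails.
func scrapePage(client *http.Client, hiddenService string, page string) (error, []byte, int) {
	if !strings.HasPrefix(page, "/") {
		page = "/" + page
	}
	response, err := client.Get("http://" + hiddenService + page)
	if err != nil {
		log.Printf("Error connecting to http://%s%s %s\n", hiddenService, page, err)
		return err, nil, -1
	}
	defer response.Body.Close()
	contents, _ := ioutil.ReadAll(response.Body)
	return nil, contents, response.StatusCode
}

func main() {
	// "example.com" stands in for a .onion host reached through a Tor-capable client.
	err, contents, status := scrapePage(http.DefaultClient, "example.com", "index.html")
	if err != nil {
		fmt.Println("fetch failed:", err)
		return
	}
	fmt.Printf("status %d, %d bytes\n", status, len(contents))
}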
@@ -6,4 +6,5 @@ import (
 
 type Scanner interface {
 	ScanPage(string, string, *report.OnionScanReport, func(Scanner, string, int, string, *report.OnionScanReport))
+	ScrapePage(string, string) (error, []byte, int)
 }
@@ -34,7 +34,8 @@ func StandardPageScan(scan Scanner, page string, status int, contents string, re
 	new(PGPContentScan).ScanContent(contents, report)
 
 	log.Printf("\tScanning for Images\n")
-	var domains := utils.ExtractDomains(contents)
+	var domains []string
+	var cssLinks []string
 
 	// parser based on http://schier.co/blog/2015/04/26/a-simple-web-scraper-in-go.html
 	z := html.NewTokenizer(strings.NewReader(contents))
@@ -49,8 +50,8 @@ func StandardPageScan(scan Scanner, page string, status int, contents string, re
 		// TODO: don't crawl links with nofollow
 
 		if tt == html.StartTagToken {
-			isLink := t.Data == "a"
-			if isLink {
+			// links
+			if t.Data == "a" {
 				linkUrl := utils.GetAttribute(t, "href")
 				if len(linkUrl) > 1 {
 					domains = append(domains, linkUrl)
@@ -58,8 +59,13 @@ func StandardPageScan(scan Scanner, page string, status int, contents string, re
 				}
 			}
 
-			isImage := t.Data == "img"
-			if isImage {
+			// css <link>
+			if t.Data == "link" && utils.GetAttribute(t, "rel") == "stylesheet" {
+				cssLinks = append(cssLinks, utils.GetAttribute(t, "href"))
+			}
+
+			// images
+			if t.Data == "img" {
 				imageUrl := utils.GetAttribute(t, "src")
 
 				baseUrl, _ := url.Parse(imageUrl)
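Note: the standalone sketch below (an illustration, not code from this commit) shows the same tokenizer pattern reduced to collecting <link rel="stylesheet"> hrefs. The getAttribute helper is a stand-in for utils.GetAttribute, and the sample markup is made up.

package main

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

// getAttribute returns the value of the named attribute on a token,
// or "" if the attribute is absent.
func getAttribute(t html.Token, name string) string {
	for _, attr := range t.Attr {
		if attr.Key == name {
			return attr.Val
		}
	}
	return ""
}

func main() {
	page := `<html><head>
	<link rel="stylesheet" href="/css/site.css">
	<link rel="icon" href="/favicon.ico">
	</head><body><img src="/logo.png"></body></html>`

	var cssLinks []string
	z := html.NewTokenizer(strings.NewReader(page))
	for {
		tt := z.Next()
		if tt == html.ErrorToken {
			break // end of document
		}
		t := z.Token()
		if tt == html.StartTagToken || tt == html.SelfClosingTagToken {
			// only <link> tags that declare a stylesheet are collected
			if t.Data == "link" && getAttribute(t, "rel") == "stylesheet" {
				cssLinks = append(cssLinks, getAttribute(t, "href"))
			}
		}
	}
	fmt.Println(cssLinks) // [/css/site.css]
}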
@@ -72,8 +78,15 @@ func StandardPageScan(scan Scanner, page string, status int, contents string, re
 			}
 		}
 
-	log.Printf("\tScanning for Links\n")
+	log.Printf("\tScanning for CSS Fonts and Background Images\n")
+	for _, cssUrl := range cssLinks {
+		log.Printf("\tScanning CSS file: %s\n", cssUrl)
+		_, cssContents, _ := scan.ScrapePage(report.HiddenService, utils.WithoutProtocol(cssUrl))
+		domains = append(domains, utils.ExtractDomains(string(cssContents))[0:]...)
+	}
+
+	log.Printf("\tScanning for Links\n")
+	domains = append(domains, utils.ExtractDomains(contents)...)
 	for _, domain := range domains {
 		baseUrl, _ := url.Parse(domain)
 		if baseUrl.Host != "" && utils.WithoutSubdomains(baseUrl.Host) != utils.WithoutSubdomains(report.HiddenService) {
@@ -95,7 +108,7 @@ func StandardPageScan(scan Scanner, page string, status int, contents string, re
 	foundPaths := r.FindAllStringSubmatch(string(contents), -1)
 	for _, regexpResults := range foundPaths {
 		path := regexpResults[2]
-		if strings.HasPrefix(path, "http") && !strings.Contains(path, utils.WithoutSubdomains(report.HiddenService)) {
+		if (strings.HasPrefix(path, "http") || strings.HasPrefix(path, "//")) && !strings.Contains(path, utils.WithoutSubdomains(report.HiddenService)) {
 			continue
 		}
 
@@ -2,11 +2,23 @@ package utils
 
 import (
 	"github.com/mvdan/xurls"
+	"regexp"
 	"strings"
 )
 
 func ExtractDomains(content string) []string {
-	return xurls.Strict.FindAllString(content, -1)
+	domains := xurls.Strict.FindAllString(content, -1)
+	cssurlregex := regexp.MustCompile(`(?i)url\((.*?)\)`)
+	cssDomains := cssurlregex.FindAllString(content, -1)
+	for _, cssDomain := range cssDomains {
+		if strings.HasPrefix(strings.ToLower(cssDomain), "url(") {
+			cssDomain = cssDomain[4 : len(cssDomain)-1]
+		}
+		if !strings.HasSuffix(cssDomain, ":before") && !strings.HasSuffix(cssDomain, ":after") {
+			domains = append(domains, cssDomain)
+		}
+	}
+	return domains
 }
 
 func WithoutSubdomains(urlhost string) string {
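Note: a small runnable sketch (not part of the commit) of what the new (?i)url\((.*?)\) pattern in ExtractDomains pulls out of a CSS body, including the trimming of the "url(" prefix and the trailing parenthesis. The sample CSS and the onion hostname are placeholders.

package main

import (
	"fmt"
	"regexp"
	"strings"
)

func main() {
	css := `body { background: url(http://example.onion/bg.png); }
.icon:before { content: url(data:image/png;base64,AAAA); }`

	// Same non-greedy, case-insensitive pattern as the diff above.
	cssurlregex := regexp.MustCompile(`(?i)url\((.*?)\)`)
	for _, match := range cssurlregex.FindAllString(css, -1) {
		if strings.HasPrefix(strings.ToLower(match), "url(") {
			match = match[4 : len(match)-1] // strip "url(" and ")"
		}
		fmt.Println(match)
	}
	// Prints:
	// http://example.onion/bg.png
	// data:image/png;base64,AAAA
}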
@@ -25,5 +37,8 @@ func WithoutProtocol(url string) string {
 	if strings.HasPrefix(url, "https://") {
 		return url[8:]
 	}
+	if strings.HasPrefix(url, "//") {
+		return url[2:]
+	}
 	return url
 }
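Note: a standalone sketch (illustration only) mirroring the WithoutProtocol behaviour, including the new protocol-relative "//host/path" case added here so stylesheet hrefs like //host/style.css can be fed back into ScrapePage. The http:// branch and the example URLs are assumptions, not taken from this diff.

package main

import (
	"fmt"
	"strings"
)

// withoutProtocol strips a leading scheme (or protocol-relative "//")
// so the remainder can be re-fetched relative to the hidden service.
func withoutProtocol(url string) string {
	if strings.HasPrefix(url, "http://") {
		return url[7:]
	}
	if strings.HasPrefix(url, "https://") {
		return url[8:]
	}
	if strings.HasPrefix(url, "//") {
		return url[2:]
	}
	return url
}

func main() {
	fmt.Println(withoutProtocol("//example.onion/css/site.css")) // example.onion/css/site.css
	fmt.Println(withoutProtocol("http://example.onion/a.css"))   // example.onion/a.css
	fmt.Println(withoutProtocol("/local/path.css"))              // /local/path.css
}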