Merge pull request #38 from mapmeld/master

Find images and links by parsing HTML
Sarah Jamie Lewis 2016-05-25 19:29:41 -07:00
commit 5b0e733da3
7 changed files with 104 additions and 29 deletions


@@ -6,6 +6,7 @@ import (
 	"github.com/s-rah/onionscan/config"
 	"github.com/s-rah/onionscan/protocol"
 	"github.com/s-rah/onionscan/report"
+	"github.com/s-rah/onionscan/utils"
 	"strings"
 )
@@ -16,10 +17,7 @@ type OnionScan struct {
 func (os *OnionScan) Scan(hiddenService string) (*report.OnionScanReport, error) {
 	// Remove Extra Prefix
-	// TODO: Add support for HTTPS?
-	if strings.HasPrefix(hiddenService, "http://") {
-		hiddenService = hiddenService[7:]
-	}
+	hiddenService = utils.WithoutProtocol(hiddenService)
 	if strings.HasSuffix(hiddenService, "/") {
 		hiddenService = hiddenService[0 : len(hiddenService)-1]

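For reference, the new utils.WithoutProtocol helper (added in utils/url_parsing.go later in this diff) strips both schemes, which is why the HTTPS TODO can come out here. A minimal sketch of the expected behavior, using hypothetical onion addresses:

package main

import (
	"fmt"

	"github.com/s-rah/onionscan/utils"
)

func main() {
	// Both prefixes are stripped; anything else is returned unchanged.
	fmt.Println(utils.WithoutProtocol("http://example.onion"))  // example.onion
	fmt.Println(utils.WithoutProtocol("https://example.onion")) // example.onion
	fmt.Println(utils.WithoutProtocol("example.onion"))         // example.onion
}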

@@ -76,9 +76,12 @@ func (hps *HTTPProtocolScanner) ScanProtocol(hiddenService string, onionscanConf
 }

 func (hps *HTTPProtocolScanner) ScanPage(hiddenService string, page string, report *report.OnionScanReport, f func(scans.Scanner, string, int, string, *report.OnionScanReport)) {
-	response, err := hps.Client.Get("http://" + hiddenService + page)
+	if !strings.Contains(page, utils.WithoutSubdomains(hiddenService)) {
+		page = hiddenService + page
+	}
+	response, err := hps.Client.Get("http://" + page)
 	if err != nil {
-		log.Printf("Error connecting to %s%s %s\n", hiddenService, page, err)
+		log.Printf("Error connecting to http://%s %s\n", page, err)
 		return
 	}
 	defer response.Body.Close()

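The effect of the new check is that ScanPage now accepts either a bare path or a URL already qualified with a host on the same service. A standalone sketch of just that decision (qualify is a hypothetical name; in the diff the logic is inline):

package main

import (
	"fmt"
	"strings"

	"github.com/s-rah/onionscan/utils"
)

// qualify mirrors the inline check in ScanPage: bare paths get the hidden
// service prepended; URLs already on the service's base domain pass through.
func qualify(hiddenService, page string) string {
	if !strings.Contains(page, utils.WithoutSubdomains(hiddenService)) {
		page = hiddenService + page
	}
	return page
}

func main() {
	fmt.Println(qualify("example.onion", "/style.css"))
	// example.onion/style.css
	fmt.Println(qualify("example.onion", "sub.example.onion/header.png"))
	// sub.example.onion/header.png (already qualified, unchanged)
}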

@@ -33,6 +33,7 @@ type OnionScanReport struct {
 	RelatedOnionServices   []string    `json:"relatedOnionServices"`
 	RelatedClearnetDomains []string    `json:"relatedOnionDomains"`
 	LinkedSites            []string    `json:"linkedSites"`
+	InternalPages          []string    `json:"InternalPages"`
 	IP                     []string    `json:"ipAddresses"`
 	OpenDirectories        []string    `json:"openDirectories"`
 	ExifImages             []ExifImage `json:"exifImages"`
@@ -86,6 +87,11 @@ func (osr *OnionScanReport) AddLinkedSite(site string) {
 	utils.RemoveDuplicates(&osr.LinkedSites)
 }

+func (osr *OnionScanReport) AddInternalPage(site string) {
+	osr.InternalPages = append(osr.InternalPages, site)
+	utils.RemoveDuplicates(&osr.InternalPages)
+}
+
 func (osr *OnionScanReport) AddPGPKey(key string) {
 	osr.PGPKeys = append(osr.PGPKeys, key)
 	utils.RemoveDuplicates(&osr.PGPKeys)

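A quick usage sketch of the new accessor, constructing the report literal directly for illustration (the scanner may build it differently); duplicates are collapsed the same way as the other list fields:

package main

import (
	"fmt"

	"github.com/s-rah/onionscan/report"
)

func main() {
	osr := &report.OnionScanReport{HiddenService: "example.onion"}
	osr.AddInternalPage("example.onion/about")
	osr.AddInternalPage("example.onion/about") // duplicate, removed
	osr.AddInternalPage("example.onion/contact")
	fmt.Println(osr.InternalPages) // [example.onion/about example.onion/contact]
}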

@@ -5,6 +5,7 @@ import (
 	"encoding/hex"
 	"github.com/s-rah/onionscan/report"
 	"github.com/s-rah/onionscan/utils"
+	"golang.org/x/net/html"
 	"log"
 	"net/url"
 	"regexp"
@@ -12,9 +13,9 @@ import (
 )

 func StandardPageScan(scan Scanner, page string, status int, contents string, report *report.OnionScanReport) {
-	log.Printf("Scanning %s%s\n", report.HiddenService, page)
+	log.Printf("Scanning %s\n", page)
 	if status == 200 {
-		log.Printf("\tPage %s%s is Accessible\n", report.HiddenService, page)
+		log.Printf("\tPage %s is Accessible\n", page)
 		hash := sha1.Sum([]byte(contents))
 		report.Hashes = append(report.Hashes, hex.EncodeToString(hash[:]))
@@ -31,44 +32,77 @@ func StandardPageScan(scan Scanner, page string, status int, contents string, re
 		}
 		new(PGPContentScan).ScanContent(contents, report)
+		log.Printf("\tScanning for Images\n")
 		domains := utils.ExtractDomains(contents)
+		// parser based on http://schier.co/blog/2015/04/26/a-simple-web-scraper-in-go.html
+		z := html.NewTokenizer(strings.NewReader(contents))
+		for {
+			tt := z.Next()
+			if tt == html.ErrorToken {
+				break
+			}
+			t := z.Token()
+			// check for an href and src attributes
+			// TODO: don't crawl links with nofollow
+			if tt == html.StartTagToken {
+				isLink := t.Data == "a"
+				if isLink {
+					linkUrl := utils.GetAttribute(t, "href")
+					if len(linkUrl) > 1 {
+						domains = append(domains, linkUrl)
+					}
+				}
+			}
+			isImage := t.Data == "img"
+			if isImage {
+				imageUrl := utils.GetAttribute(t, "src")
+				baseUrl, _ := url.Parse(imageUrl)
+				if utils.WithoutSubdomains(baseUrl.Host) == utils.WithoutSubdomains(report.HiddenService) {
+					scan.ScanPage(report.HiddenService, utils.WithoutProtocol(imageUrl), report, CheckExif)
+					log.Printf("\t Found internal image %s\n", imageUrl)
+				} else {
+					log.Printf("\t Not scanning remote image %s\n", imageUrl)
+				}
+			}
+		}
 		log.Printf("\tScanning for Links\n")
 		for _, domain := range domains {
-			if !strings.HasPrefix(domain, "http://"+report.HiddenService) {
+			baseUrl, _ := url.Parse(domain)
+			if baseUrl.Host != "" && utils.WithoutSubdomains(baseUrl.Host) != utils.WithoutSubdomains(report.HiddenService) {
 				log.Printf("Found Related URL %s\n", domain)
 				// TODO: Lots of information here which needs to be processed.
 				// * Links to standard sites - google / bitpay etc.
 				// * Links to other onion sites
 				// * Links to obscure clearnet sites.
-				baseUrl, _ := url.Parse(domain)
 				report.AddLinkedSite(baseUrl.Host)
 			} else {
-				// * Process FQDN internal links (unlikly)
+				// * Process FQDN internal links
+				log.Printf("Found Internal URL %s\n", domain)
+				report.AddInternalPage(baseUrl.Host)
 			}
 		}
-		log.Printf("\tScanning for Images\n")
-		r := regexp.MustCompile("src=\"(" + "http://" + report.HiddenService + "/)?((.*?\\.jpg)|(.*?\\.png)|(.*?\\.jpeg)|(.*?\\.gif))\"")
-		foundImages := r.FindAllStringSubmatch(string(contents), -1)
-		for _, image := range foundImages {
-			log.Printf("\t Found image %s\n", image[2])
-			scan.ScanPage(report.HiddenService, "/"+image[2], report, CheckExif)
-		}
 		log.Printf("\tScanning for Referenced Directories\n")
-		r = regexp.MustCompile("(src|href)=\"([^\"]*)\"")
+		r := regexp.MustCompile("(src|href)=\"([^\"]*)\"")
 		foundPaths := r.FindAllStringSubmatch(string(contents), -1)
 		for _, regexpResults := range foundPaths {
			path := regexpResults[2]
-			if strings.HasPrefix(path, "http") {
+			if strings.HasPrefix(path, "http") && !strings.Contains(path, utils.WithoutSubdomains(report.HiddenService)) {
 				continue
 			}
 			term := strings.LastIndex(path, "/")
 			if term > 0 {
 				log.Printf("\t Found Referenced Directory %s\n", path[:term])
-				report.AddPageReferencedDirectory(path[:term])
+				report.AddPageReferencedDirectory(utils.WithoutProtocol(path[:term]))
 			}
 		}
 	} else if status == 403 {

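For readers unfamiliar with golang.org/x/net/html, here is a self-contained sketch of the tokenizer pattern the new code follows (the input markup is hypothetical). Note that in the hunk above the img check deliberately sits outside the StartTagToken branch, so self-closing <img/> tokens are matched as well:

package main

import (
	"fmt"
	"strings"

	"github.com/s-rah/onionscan/utils"
	"golang.org/x/net/html"
)

func main() {
	page := `<a href="http://other.onion/">link</a><img src="/photo.jpg"/>`
	z := html.NewTokenizer(strings.NewReader(page))
	for {
		tt := z.Next()
		if tt == html.ErrorToken {
			break // io.EOF once the input is exhausted
		}
		t := z.Token()
		if tt == html.StartTagToken && t.Data == "a" {
			fmt.Println("link:", utils.GetAttribute(t, "href"))
		}
		if t.Data == "img" { // matches StartTagToken and SelfClosingTagToken
			fmt.Println("image:", utils.GetAttribute(t, "src"))
		}
	}
}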
utils/html_parsing.go (new file, 12 lines)

@@ -0,0 +1,12 @@
+package utils
+
+import "golang.org/x/net/html"
+
+func GetAttribute(tag html.Token, name string) string {
+	for _, a := range tag.Attr {
+		if a.Key == name {
+			return a.Val
+		}
+	}
+	return ""
+}

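GetAttribute returns the empty string when the attribute is absent, which is what lets the callers above treat a missing href or src as "nothing found". A quick sketch with hypothetical markup:

package main

import (
	"fmt"
	"strings"

	"github.com/s-rah/onionscan/utils"
	"golang.org/x/net/html"
)

func main() {
	z := html.NewTokenizer(strings.NewReader(`<a href="/about">About</a>`))
	z.Next()       // advance to the <a> start tag
	t := z.Token()
	fmt.Println(utils.GetAttribute(t, "href")) // /about
	fmt.Println(utils.GetAttribute(t, "rel"))  // "" (attribute not present)
}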
utils/url_parsing.go (new file, 29 lines)

@@ -0,0 +1,29 @@
+package utils
+
+import (
+	"github.com/mvdan/xurls"
+	"strings"
+)
+
+func ExtractDomains(content string) []string {
+	return xurls.Strict.FindAllString(content, -1)
+}
+
+func WithoutSubdomains(urlhost string) string {
+	urlParts := strings.Split(urlhost, ".")
+	if len(urlParts) < 2 {
+		return ""
+	} else {
+		return strings.Join(urlParts[len(urlParts)-2:], ".")
+	}
+}
+
+func WithoutProtocol(url string) string {
+	if strings.HasPrefix(url, "http://") {
+		return url[7:]
+	}
+	if strings.HasPrefix(url, "https://") {
+		return url[8:]
+	}
+	return url
+}

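Two behaviors of WithoutSubdomains worth noting: it keeps only the last two labels of the host, and a host with fewer than two labels collapses to the empty string. A quick sketch with hypothetical hosts:

package main

import (
	"fmt"

	"github.com/s-rah/onionscan/utils"
)

func main() {
	fmt.Println(utils.WithoutSubdomains("www.sub.example.onion")) // example.onion
	fmt.Println(utils.WithoutSubdomains("example.onion"))         // example.onion
	fmt.Println(utils.WithoutSubdomains("localhost"))             // "" (fewer than two labels)
}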
(deleted file: its ExtractDomains helper now lives in utils/url_parsing.go above)

@@ -1,7 +0,0 @@
-package utils
-
-import "github.com/mvdan/xurls"
-
-func ExtractDomains(content string) []string {
-	return xurls.Strict.FindAllString(content, -1)
-}