Extract all URLs during standard page scan.

External URLs found are currently only logged; nothing is pulled through into reporting yet.
2016-04-12 22:03:39 -07:00 · 2016-04-12 22:03:39 -07:00 · a0ae46ca31
parent d059be932a
commit a0ae46ca31
3 changed files with 24 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -8,6 +8,7 @@ deanonymize.

 * h12.me/socks - For the Tor SOCKS Proxy connection.
 * github.com/xiam/exif - For EXIF data extraction.
+* github.com/mvdan/xurls - For some URL parsing.

 ## OS Package Dependencies

--- a/scans/standard-page-scan.go
+++ b/scans/standard-page-scan.go
@ -2,8 +2,10 @@ package scans

 import (
 	"github.com/s-rah/onionscan/report"
+	"github.com/s-rah/onionscan/utils"
 	"log"
 	"regexp"
+	"strings"
 )

 func StandardPageScan(scan Scanner, page string, status int, contents string, report *report.OnionScanReport) {
@ -11,6 +13,20 @@ func StandardPageScan(scan Scanner, page string, status int, contents string, re
 	if status == 200 {
 		log.Printf("\tPage %s%s is Accessible\n", report.HiddenService, page)

+		domains := utils.ExtractDomains(contents)
+
+		for _,domain := range domains {
+			if !strings.HasPrefix(domain, "http://"+report.HiddenService) {
+				log.Printf("Found Related URL %s\n", domain)
+				// TODO: Lots of information here which needs to be processed.
+				// * Links to standard sites - google / bitpay etc.
+				// * Links to other onion sites
+				// * Links to obscure clearnet sites.
+			} else {
+				// * Process Internal links
+			}
+		} 
+
 		log.Printf("\tScanning for Images\n")
 		r := regexp.MustCompile("src=\"(" + "http://" + report.HiddenService + "/)?((.*?\\.jpg)|(.*?\\.png)|(.*?\\.jpeg)|(.*?\\.gif))\"")
 		foundImages := r.FindAllStringSubmatch(string(contents), -1)
--- a/utils/useful_regexps.go
+++ b/utils/useful_regexps.go
@ -0,0 +1,7 @@
+package utils
+
+import "github.com/mvdan/xurls"
+
+func ExtractDomains(content string) []string { // NOTE(review): despite the name, the caller compares results against an "http://..." prefix, i.e. treats them as full URLs.
+	return xurls.Strict.FindAllString(content, -1) // -1 presumably means "no match limit", as with regexp.FindAllString — confirm xurls.Strict is a *regexp.Regexp.
+}