Extract all URLs during standard page scan.
Currently not pulled through into reporting.
commit a0ae46ca31 (parent d059be932a)
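The commit message notes that the extracted URLs are not yet pulled through into reporting. A minimal sketch of what that wiring could look like, using a hypothetical RelatedURLs field and AddRelatedURL helper that this commit does not add:

```go
package report

// Sketch only: of the fields shown here, only HiddenService is known to
// exist in this commit. RelatedURLs is a hypothetical addition.
type OnionScanReport struct {
	HiddenService string
	RelatedURLs   []string
}

// AddRelatedURL would record a URL found during a page scan so that it
// reaches the final report. Hypothetical helper, not part of this commit.
func (osr *OnionScanReport) AddRelatedURL(url string) {
	osr.RelatedURLs = append(osr.RelatedURLs, url)
}
```

StandardPageScan could then call report.AddRelatedURL(domain) at the point where it currently only logs the match.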
@@ -8,6 +8,7 @@ deanonymize.
 
 * h12.me/socks - For the Tor SOCKS Proxy connection.
 * github.com/xiam/exif - For EXIF data extraction.
+* github.com/mvdan/xurls - For some URL parsing.
 
 ## OS Package Dependencies
 
@@ -2,8 +2,10 @@ package scans
 
 import (
 	"github.com/s-rah/onionscan/report"
+	"github.com/s-rah/onionscan/utils"
 	"log"
 	"regexp"
+	"strings"
 )
 
 func StandardPageScan(scan Scanner, page string, status int, contents string, report *report.OnionScanReport) {
@@ -11,6 +13,20 @@ func StandardPageScan(scan Scanner, page string, status int, contents string, re
 	if status == 200 {
 		log.Printf("\tPage %s%s is Accessible\n", report.HiddenService, page)
 
+		domains := utils.ExtractDomains(contents)
+
+		for _, domain := range domains {
+			if !strings.HasPrefix(domain, "http://"+report.HiddenService) {
+				log.Printf("Found Related URL %s\n", domain)
+				// TODO: Lots of information here which needs to be processed.
+				// * Links to standard sites - google / bitpay etc.
+				// * Links to other onion sites
+				// * Links to obscure clearnet sites.
+			} else {
+				// * Process Internal links
+			}
+		}
+
 		log.Printf("\tScanning for Images\n")
 		r := regexp.MustCompile("src=\"(" + "http://" + report.HiddenService + "/)?((.*?\\.jpg)|(.*?\\.png)|(.*?\\.jpeg)|(.*?\\.gif))\"")
 		foundImages := r.FindAllStringSubmatch(string(contents), -1)
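The loop added above classifies every URL by a simple prefix test: anything that does not start with http:// followed by the scanned hidden service address is logged as a related URL, and everything else falls into the (currently empty) internal-link branch. A standalone sketch of that check, using a made-up onion address:

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	// Made-up hidden service address, for illustration only.
	hiddenService := "examplev2address.onion"
	domains := []string{
		"http://examplev2address.onion/contact",
		"http://someotherhidden.onion/",
		"https://www.google.com/",
	}
	for _, domain := range domains {
		if !strings.HasPrefix(domain, "http://"+hiddenService) {
			fmt.Printf("Found Related URL %s\n", domain) // other onion or clearnet site
		} else {
			fmt.Printf("Internal link %s\n", domain) // same hidden service
		}
	}
}
```

Note that the prefix test treats an https:// link to the same hidden service as external, since only the http:// form is checked.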
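The image scan that follows builds its regular expression from the hidden service address, with an optional absolute http:// prefix so that both absolute and relative src attributes are caught. A quick check of what the submatches contain, again with a made-up address (capture group 2 holds the image path):

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Made-up hidden service address and page snippet, for illustration only.
	hiddenService := "examplev2address.onion"
	contents := `<img src="http://examplev2address.onion/header.jpg"> <img src="photos/cat.png">`

	r := regexp.MustCompile("src=\"(" + "http://" + hiddenService + "/)?((.*?\\.jpg)|(.*?\\.png)|(.*?\\.jpeg)|(.*?\\.gif))\"")
	for _, m := range r.FindAllStringSubmatch(contents, -1) {
		// m[0] is the whole src attribute, m[2] the captured image path.
		fmt.Println(m[2])
	}
	// Output:
	// header.jpg
	// photos/cat.png
}
```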
@@ -0,0 +1,7 @@
+package utils
+
+import "github.com/mvdan/xurls"
+
+func ExtractDomains(content string) []string {
+	return xurls.Strict.FindAllString(content, -1)
+}
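ExtractDomains delegates entirely to xurls; despite its name, it returns full URLs rather than bare domains. A standalone sketch of what the call yields, assuming the xurls API used in this diff, where xurls.Strict is a package-level *regexp.Regexp that only matches URLs carrying a scheme:

```go
package main

import (
	"fmt"

	"github.com/mvdan/xurls"
)

func main() {
	content := `See http://examplev2address.onion/about and https://github.com/s-rah/onionscan as well as onionscan.org`
	// Strict requires a scheme, so the bare onionscan.org is not matched.
	for _, url := range xurls.Strict.FindAllString(content, -1) {
		fmt.Println(url)
	}
	// Output:
	// http://examplev2address.onion/about
	// https://github.com/s-rah/onionscan
}
```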