onionscan/scans/standard-page-scan.go

package scans

import (
	"crypto/sha1"
	"encoding/hex"
	"github.com/s-rah/onionscan/report"
	"github.com/s-rah/onionscan/utils"
	"golang.org/x/net/html"
	"log"
	"net/url"
	"regexp"
	"strings"
)

// referencedPathPattern matches src="..." and href="..." attributes; capture
// group 2 holds the quoted attribute value. It is compiled once at package
// scope instead of on every page scan.
var referencedPathPattern = regexp.MustCompile("(src|href)=\"([^\"]*)\"")

// extractPageTitle returns the text between the first "<title>" in contents
// and the first "</title>" that follows it. ok is false when either marker is
// missing, so the caller never slices with a negative or inverted index.
func extractPageTitle(contents string) (title string, ok bool) {
	const openTag, closeTag = "<title>", "</title>"

	start := strings.Index(contents, openTag)
	if start < 0 {
		return "", false
	}
	rest := contents[start+len(openTag):]
	end := strings.Index(rest, closeTag)
	if end < 0 {
		return "", false
	}
	return rest[:end], true
}

// StandardPageScan inspects a fetched page and records what it finds on report.
//
// scan is used to fetch further resources (images, stylesheets), page is the
// path that was requested, status is the HTTP status code of the response and
// contents is the response body.
//
// For a 200 response it:
//   - appends the SHA-1 of contents to report.Hashes and stores contents as
//     report.Snapshot,
//   - records the page title when a <title>...</title> pair is present,
//   - runs the PGP content scan,
//   - walks the HTML collecting anchor targets and stylesheet links, and hands
//     images hosted on the hidden service to scan.ScanPage with CheckExif,
//   - extracts domains from each linked stylesheet and from the page itself,
//     reporting them as linked sites or internal pages,
//   - reports the directory part of every src/href value that is not an
//     absolute link to another site.
//
// For 403 and 404 it only logs the outcome; any other status is ignored.
func StandardPageScan(scan Scanner, page string, status int, contents string, report *report.OnionScanReport) {
	log.Printf("Scanning %s\n", page)

	switch status {
	case 200:
		// Handled below.
	case 403:
		log.Printf("\tPage %s%s is Forbidden\n", report.HiddenService, page)
		return
	case 404:
		log.Printf("\tPage %s%s is Does Not Exist\n", report.HiddenService, page)
		return
	default:
		return
	}

	log.Printf("\tPage %s is Accessible\n", page)

	hash := sha1.Sum([]byte(contents))
	report.Hashes = append(report.Hashes, hex.EncodeToString(hash[:]))
	report.Snapshot = contents

	// Try to resolve the page title if present. A page with an unterminated
	// <title> is skipped rather than crashing the scan.
	if pageTitle, ok := extractPageTitle(contents); ok {
		log.Printf("\tPage Title: %s\n", pageTitle)
		report.PageTitle = pageTitle
	}

	new(PGPContentScan).ScanContent(contents, report)

	log.Printf("\tScanning for Images\n")
	var domains []string
	var cssLinks []string

	// parser based on http://schier.co/blog/2015/04/26/a-simple-web-scraper-in-go.html
	// TODO: don't crawl links with nofollow
	z := html.NewTokenizer(strings.NewReader(contents))
	for {
		tt := z.Next()
		if tt == html.ErrorToken {
			break
		}
		// Only opening and self-closing tags carry attributes. Skipping the
		// rest keeps end tags and text nodes that merely read "img" or
		// "link" from being treated as elements.
		if tt != html.StartTagToken && tt != html.SelfClosingTagToken {
			continue
		}
		t := z.Token()

		switch t.Data {
		case "a":
			// Anchors are only collected from opening tags.
			if tt == html.StartTagToken {
				if linkURL := utils.GetAttribute(t, "href"); len(linkURL) > 1 {
					domains = append(domains, linkURL)
				}
			}

		case "link":
			// css <link>
			if utils.GetAttribute(t, "rel") == "stylesheet" {
				cssLinks = append(cssLinks, utils.GetAttribute(t, "href"))
			}

		case "img":
			imageURL := utils.GetAttribute(t, "src")

			// url.Parse returns a nil *URL on failure, so the error must be
			// checked before touching Host.
			parsed, err := url.Parse(imageURL)
			if err != nil {
				log.Printf("\t Skipping image with unparseable URL %q: %v\n", imageURL, err)
				continue
			}
			if utils.WithoutSubdomains(parsed.Host) == utils.WithoutSubdomains(report.HiddenService) {
				scan.ScanPage(report.HiddenService, utils.WithoutProtocol(imageURL), report, CheckExif)
				log.Printf("\t Found internal image %s\n", imageURL)
			} else {
				log.Printf("\t Not scanning remote image %s\n", imageURL)
			}
		}
	}

	log.Printf("\tScanning for CSS Fonts and Background Images\n")
	for _, cssURL := range cssLinks {
		log.Printf("\tScanning CSS file: %s\n", cssURL)
		// NOTE(review): the other two results of ScrapePage are discarded, so
		// a failed fetch is treated the same as an empty stylesheet.
		_, cssContents, _ := scan.ScrapePage(report.HiddenService, utils.WithoutProtocol(cssURL))
		domains = append(domains, utils.ExtractDomains(string(cssContents))...)
	}

	log.Printf("\tScanning for Links\n")
	domains = append(domains, utils.ExtractDomains(contents)...)
	for _, domain := range domains {
		parsed, err := url.Parse(domain)
		if err != nil {
			// Malformed link in untrusted page content; nothing to report.
			log.Printf("\t Skipping unparseable URL %q: %v\n", domain, err)
			continue
		}
		if parsed.Host != "" && utils.WithoutSubdomains(parsed.Host) != utils.WithoutSubdomains(report.HiddenService) {
			log.Printf("Found Related URL %s\n", domain)
			// TODO: Lots of information here which needs to be processed.
			// * Links to standard sites - google / bitpay etc.
			// * Links to other onion sites
			// * Links to obscure clearnet sites.
			report.AddLinkedSite(parsed.Host)
		} else {
			// * Process FQDN internal links
			log.Printf("Found Internal URL %s\n", domain)
			report.AddInternalPage(parsed.Host)
		}
	}

	log.Printf("\tScanning for Referenced Directories\n")
	for _, match := range referencedPathPattern.FindAllStringSubmatch(contents, -1) {
		refPath := match[2]

		// Skip absolute and protocol-relative URLs that point at another site.
		isAbsolute := strings.HasPrefix(refPath, "http") || strings.HasPrefix(refPath, "//")
		if isAbsolute && !strings.Contains(refPath, utils.WithoutSubdomains(report.HiddenService)) {
			continue
		}

		// Everything before the last slash is the directory; a slash at
		// index 0 (or none at all) leaves no directory to report.
		if term := strings.LastIndex(refPath, "/"); term > 0 {
			log.Printf("\t Found Referenced Directory %s\n", refPath[:term])
			report.AddPageReferencedDirectory(utils.WithoutProtocol(refPath[:term]))
		}
	}
}
Initial Commit 2016-04-10 00:04:22 +00:00			`package scans`

			`import (`
Pull referenced directories from page scan and scan them along with common directories in http scanner 2016-04-24 17:28:59 +00:00			`"crypto/sha1"`
			`"encoding/hex"`
Initial Commit 2016-04-10 00:04:22 +00:00			`"github.com/s-rah/onionscan/report"`
Extract all URLs during standard page scan. Currently not pulled through into reporting. 2016-04-13 05:03:39 +00:00			`"github.com/s-rah/onionscan/utils"`
Find images and links by parsing HTML; support HTTPS and subdomains 2016-05-24 10:13:00 +00:00			`"golang.org/x/net/html"`
Initial Commit 2016-04-10 00:04:22 +00:00			`"log"`
Pull referenced directories from page scan and scan them along with common directories in http scanner 2016-04-24 17:28:59 +00:00			`"net/url"`
Initial Commit 2016-04-10 00:04:22 +00:00			`"regexp"`
Extract all URLs during standard page scan. Currently not pulled through into reporting. 2016-04-13 05:03:39 +00:00			`"strings"`
Initial Commit 2016-04-10 00:04:22 +00:00			`)`

			`func StandardPageScan(scan Scanner, page string, status int, contents string, report *report.OnionScanReport) {`
Find images and links by parsing HTML; support HTTPS and subdomains 2016-05-24 10:13:00 +00:00			`log.Printf("Scanning %s\n", page)`
Initial Commit 2016-04-10 00:04:22 +00:00			`if status == 200 {`
Find images and links by parsing HTML; support HTTPS and subdomains 2016-05-24 10:13:00 +00:00			`log.Printf("\tPage %s is Accessible\n", page)`
Pull referenced directories from page scan and scan them along with common directories in http scanner 2016-04-24 17:28:59 +00:00
New Protocols Scans, SSH Fingerprinting * SSH Fingerprint * Page Snapshot * A few new Protocol Tests (FTP, SMTP, Ricochet, IRC) 2016-04-25 02:46:28 +00:00			`hash := sha1.Sum([]byte(contents))`
			`report.Hashes = append(report.Hashes, hex.EncodeToString(hash[:]))`
			`report.Snapshot = contents`
Initial Commit 2016-04-10 00:04:22 +00:00
Improving Standard Page Scan 2016-04-25 09:29:27 +00:00			`// Try resolve page title if present`
			`isTitlePresent := strings.Contains(contents, "<title>")`
			`if isTitlePresent {`
			`var startIndex = strings.Index(contents, "<title>")`
			`var endIndex = strings.Index(contents, "</title>")`
go fmt 2016-04-27 00:47:00 +00:00			`var pageTitle = contents[startIndex+len("<title>") : endIndex]`
Improving Standard Page Scan 2016-04-25 09:29:27 +00:00			`log.Printf("\tPage Title: %s\n", pageTitle)`
			`report.PageTitle = pageTitle`
			`}`

Extracting PGP Keys from Pages Also fixes a bug with reporting of headers. 2016-05-10 01:42:19 +00:00			`new(PGPContentScan).ScanContent(contents, report)`
Find images and links by parsing HTML; support HTTPS and subdomains 2016-05-24 10:13:00 +00:00
			`log.Printf("\tScanning for Images\n")`
Scan CSS style tags and stylesheet links 2016-05-26 09:44:55 +00:00			`var domains []string`
			`var cssLinks []string`
Pull referenced directories from page scan and scan them along with common directories in http scanner 2016-04-24 17:28:59 +00:00
Find images and links by parsing HTML; support HTTPS and subdomains 2016-05-24 10:13:00 +00:00			`// parser based on http://schier.co/blog/2015/04/26/a-simple-web-scraper-in-go.html`
			`z := html.NewTokenizer(strings.NewReader(contents))`
			`for {`
			`tt := z.Next()`
			`if tt == html.ErrorToken {`
			`break`
			`}`
			`t := z.Token()`

			`// check for an href and src attributes`
			`// TODO: don't crawl links with nofollow`

			`if tt == html.StartTagToken {`
Scan CSS style tags and stylesheet links 2016-05-26 09:44:55 +00:00			`// links`
			`if t.Data == "a" {`
Find images and links by parsing HTML; support HTTPS and subdomains 2016-05-24 10:13:00 +00:00			`linkUrl := utils.GetAttribute(t, "href")`
			`if len(linkUrl) > 1 {`
			`domains = append(domains, linkUrl)`
			`}`
			`}`
			`}`

Scan CSS style tags and stylesheet links 2016-05-26 09:44:55 +00:00			`// css <link>`
			`if t.Data == "link" && utils.GetAttribute(t, "rel") == "stylesheet" {`
			`cssLinks = append(cssLinks, utils.GetAttribute(t, "href"))`
			`}`

			`// images`
			`if t.Data == "img" {`
Find images and links by parsing HTML; support HTTPS and subdomains 2016-05-24 10:13:00 +00:00			`imageUrl := utils.GetAttribute(t, "src")`

			`baseUrl, _ := url.Parse(imageUrl)`
			`if utils.WithoutSubdomains(baseUrl.Host) == utils.WithoutSubdomains(report.HiddenService) {`
			`scan.ScanPage(report.HiddenService, utils.WithoutProtocol(imageUrl), report, CheckExif)`
			`log.Printf("\t Found internal image %s\n", imageUrl)`
			`} else {`
			`log.Printf("\t Not scanning remote image %s\n", imageUrl)`
			`}`
			`}`
			`}`

Scan CSS style tags and stylesheet links 2016-05-26 09:44:55 +00:00			`log.Printf("\tScanning for CSS Fonts and Background Images\n")`
			`for _, cssUrl := range cssLinks {`
			`log.Printf("\tScanning CSS file: %s\n", cssUrl)`
			`_, cssContents, _ := scan.ScrapePage(report.HiddenService, utils.WithoutProtocol(cssUrl))`
			`domains = append(domains, utils.ExtractDomains(string(cssContents))[0:]...)`
			`}`
Find images and links by parsing HTML; support HTTPS and subdomains 2016-05-24 10:13:00 +00:00
Scan CSS style tags and stylesheet links 2016-05-26 09:44:55 +00:00			`log.Printf("\tScanning for Links\n")`
			`domains = append(domains, utils.ExtractDomains(contents)...)`
Pull referenced directories from page scan and scan them along with common directories in http scanner 2016-04-24 17:28:59 +00:00			`for _, domain := range domains {`
Find images and links by parsing HTML; support HTTPS and subdomains 2016-05-24 10:13:00 +00:00			`baseUrl, _ := url.Parse(domain)`
			`if baseUrl.Host != "" && utils.WithoutSubdomains(baseUrl.Host) != utils.WithoutSubdomains(report.HiddenService) {`
Extract all URLs during standard page scan. Currently not pulled through into reporting. 2016-04-13 05:03:39 +00:00			`log.Printf("Found Related URL %s\n", domain)`
			`// TODO: Lots of information here which needs to be processed.`
			`// * Links to standard sites - google / bitpay etc.`
			`// * Links to other onion sites`
			`// * Links to obscure clearnet sites.`
New Protocols Scans, SSH Fingerprinting * SSH Fingerprint * Page Snapshot * A few new Protocol Tests (FTP, SMTP, Ricochet, IRC) 2016-04-25 02:46:28 +00:00			`report.AddLinkedSite(baseUrl.Host)`
Extract all URLs during standard page scan. Currently not pulled through into reporting. 2016-04-13 05:03:39 +00:00			`} else {`
Find images and links by parsing HTML; support HTTPS and subdomains 2016-05-24 10:13:00 +00:00			`// * Process FQDN internal links`
New Protocols Scans, SSH Fingerprinting * SSH Fingerprint * Page Snapshot * A few new Protocol Tests (FTP, SMTP, Ricochet, IRC) 2016-04-25 02:46:28 +00:00			`log.Printf("Found Internal URL %s\n", domain)`
Find images and links by parsing HTML; support HTTPS and subdomains 2016-05-24 10:13:00 +00:00			`report.AddInternalPage(baseUrl.Host)`
Extract all URLs during standard page scan. Currently not pulled through into reporting. 2016-04-13 05:03:39 +00:00			`}`
Pull referenced directories from page scan and scan them along with common directories in http scanner 2016-04-24 17:28:59 +00:00			`}`
Extract all URLs during standard page scan. Currently not pulled through into reporting. 2016-04-13 05:03:39 +00:00
Pull referenced directories from page scan and scan them along with common directories in http scanner 2016-04-24 17:28:59 +00:00			`log.Printf("\tScanning for Referenced Directories\n")`
Find images and links by parsing HTML; support HTTPS and subdomains 2016-05-24 10:13:00 +00:00			`r := regexp.MustCompile("(src|href)=\"([^\"]*)\"")`
Pull referenced directories from page scan and scan them along with common directories in http scanner 2016-04-24 17:28:59 +00:00			`foundPaths := r.FindAllStringSubmatch(string(contents), -1)`
			`for _, regexpResults := range foundPaths {`
			`path := regexpResults[2]`
Scan CSS style tags and stylesheet links 2016-05-26 09:44:55 +00:00			`if (strings.HasPrefix(path, "http") || strings.HasPrefix(path, "//")) && !strings.Contains(path, utils.WithoutSubdomains(report.HiddenService)) {`
Pull referenced directories from page scan and scan them along with common directories in http scanner 2016-04-24 17:28:59 +00:00			`continue`
			`}`

			`term := strings.LastIndex(path, "/")`
			`if term > 0 {`
			`log.Printf("\t Found Referenced Directory %s\n", path[:term])`
Find images and links by parsing HTML; support HTTPS and subdomains 2016-05-24 10:13:00 +00:00			`report.AddPageReferencedDirectory(utils.WithoutProtocol(path[:term]))`
Pull referenced directories from page scan and scan them along with common directories in http scanner 2016-04-24 17:28:59 +00:00			`}`
			`}`
Initial Commit 2016-04-10 00:04:22 +00:00			`} else if status == 403 {`
			`log.Printf("\tPage %s%s is Forbidden\n", report.HiddenService, page)`
			`} else if status == 404 {`
			`log.Printf("\tPage %s%s is Does Not Exist\n", report.HiddenService, page)`
			`}`
			`}`