package scans

import (
	"crypto/sha1"
	"encoding/hex"
	"github.com/s-rah/onionscan/report"
	"github.com/s-rah/onionscan/utils"
	"golang.org/x/net/html"
	"log"
	"net/url"
	"regexp"
	"strings"
)
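
// StandardPageScan extracts information from a single fetched page: it hashes
// and snapshots the content, records the page title, scans for PGP blocks,
// images, stylesheets, internal and external links, and referenced
// directories, and adds what it finds to the report.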
func StandardPageScan(scan Scanner, page string, status int, contents string, report *report.OnionScanReport) {
	log.Printf("Scanning %s\n", page)
	if status == 200 {
		log.Printf("\tPage %s is Accessible\n", page)

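		// Fingerprint and snapshot the page content for the report.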
		hash := sha1.Sum([]byte(contents))
		report.Hashes = append(report.Hashes, hex.EncodeToString(hash[:]))
		report.Snapshot = contents

		// Try to resolve the page title, if present
		isTitlePresent := strings.Contains(contents, "<title>")
		if isTitlePresent {
			var startIndex = strings.Index(contents, "<title>")
			var endIndex = strings.Index(contents, "</title>")
			var pageTitle = contents[startIndex+len("<title>") : endIndex]
			log.Printf("\tPage Title: %s\n", pageTitle)
			report.PageTitle = pageTitle
		}

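		// Scan the page content for PGP key blocks.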
		new(PGPContentScan).ScanContent(contents, report)

log.Printf("\tScanning for Images\n")
|
2016-05-26 09:44:55 +00:00
|
|
|
var domains []string
|
|
|
|
var cssLinks []string
|
2016-04-24 17:28:59 +00:00
|
|
|
|
2016-05-24 10:13:00 +00:00
|
|
|
// parser based on http://schier.co/blog/2015/04/26/a-simple-web-scraper-in-go.html
|
|
|
|
z := html.NewTokenizer(strings.NewReader(contents))
|
|
|
|
for {
			tt := z.Next()
			if tt == html.ErrorToken {
				break
			}
			t := z.Token()

			// check for href and src attributes
			// TODO: don't crawl links with nofollow

			if tt == html.StartTagToken {
				// links
				if t.Data == "a" {
					linkUrl := utils.GetAttribute(t, "href")
					if len(linkUrl) > 1 {
						domains = append(domains, linkUrl)
					}
				}
			}

			// css <link>
			if t.Data == "link" && utils.GetAttribute(t, "rel") == "stylesheet" {
				cssLinks = append(cssLinks, utils.GetAttribute(t, "href"))
			}

			// images
			if t.Data == "img" {
				imageUrl := utils.GetAttribute(t, "src")

				// Only scan images hosted on the same onion for EXIF data;
				// remote images are logged but not fetched.
				baseUrl, _ := url.Parse(imageUrl)
				if utils.WithoutSubdomains(baseUrl.Host) == utils.WithoutSubdomains(report.HiddenService) {
					scan.ScanPage(report.HiddenService, utils.WithoutProtocol(imageUrl), report, CheckExif)
					log.Printf("\t Found internal image %s\n", imageUrl)
				} else {
					log.Printf("\t Not scanning remote image %s\n", imageUrl)
				}
			}
		}
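
		// Fetch each linked stylesheet and pull out any domains it references.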
		log.Printf("\tScanning for CSS Fonts and Background Images\n")
		for _, cssUrl := range cssLinks {
			log.Printf("\tScanning CSS file: %s\n", cssUrl)
			_, cssContents, _ := scan.ScrapePage(report.HiddenService, utils.WithoutProtocol(cssUrl))
			domains = append(domains, utils.ExtractDomains(string(cssContents))...)
		}

		log.Printf("\tScanning for Links\n")
		domains = append(domains, utils.ExtractDomains(contents)...)
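
		// Classify each discovered domain as either an external linked site
		// or an internal page of this hidden service.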
		for _, domain := range domains {
			baseUrl, _ := url.Parse(domain)
			if baseUrl.Host != "" && utils.WithoutSubdomains(baseUrl.Host) != utils.WithoutSubdomains(report.HiddenService) {
				log.Printf("Found Related URL %s\n", domain)
				// TODO: Lots of information here which needs to be processed:
				// * Links to standard sites - google / bitpay etc.
				// * Links to other onion sites
				// * Links to obscure clearnet sites.
				report.AddLinkedSite(baseUrl.Host)
			} else {
				// Process FQDN internal links
				log.Printf("Found Internal URL %s\n", domain)
				report.AddInternalPage(baseUrl.Host)
			}
		}

		log.Printf("\tScanning for Referenced Directories\n")
		r := regexp.MustCompile("(src|href)=\"([^\"]*)\"")
		foundPaths := r.FindAllStringSubmatch(string(contents), -1)
		for _, regexpResults := range foundPaths {
			path := regexpResults[2]
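			// Skip absolute and protocol-relative URLs that point away from
			// this hidden service.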
			if (strings.HasPrefix(path, "http") || strings.HasPrefix(path, "//")) && !strings.Contains(path, utils.WithoutSubdomains(report.HiddenService)) {
				continue
			}

			term := strings.LastIndex(path, "/")
			if term > 0 {
				log.Printf("\t Found Referenced Directory %s\n", path[:term])
				report.AddPageReferencedDirectory(utils.WithoutProtocol(path[:term]))
			}
		}
	} else if status == 403 {
		log.Printf("\tPage %s%s is Forbidden\n", report.HiddenService, page)
	} else if status == 404 {
		log.Printf("\tPage %s%s Does Not Exist\n", report.HiddenService, page)
	}
}