Merge pull request #38 from mapmeld/master
Find images and links by parsing HTML
This commit is contained in:
commit
5b0e733da3
|
@ -6,6 +6,7 @@ import (
|
||||||
"github.com/s-rah/onionscan/config"
|
"github.com/s-rah/onionscan/config"
|
||||||
"github.com/s-rah/onionscan/protocol"
|
"github.com/s-rah/onionscan/protocol"
|
||||||
"github.com/s-rah/onionscan/report"
|
"github.com/s-rah/onionscan/report"
|
||||||
|
"github.com/s-rah/onionscan/utils"
|
||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -16,10 +17,7 @@ type OnionScan struct {
|
||||||
func (os *OnionScan) Scan(hiddenService string) (*report.OnionScanReport, error) {
|
func (os *OnionScan) Scan(hiddenService string) (*report.OnionScanReport, error) {
|
||||||
|
|
||||||
// Remove Extra Prefix
|
// Remove Extra Prefix
|
||||||
// TODO: Add support for HTTPS?
|
hiddenService = utils.WithoutProtocol(hiddenService)
|
||||||
if strings.HasPrefix(hiddenService, "http://") {
|
|
||||||
hiddenService = hiddenService[7:]
|
|
||||||
}
|
|
||||||
|
|
||||||
if strings.HasSuffix(hiddenService, "/") {
|
if strings.HasSuffix(hiddenService, "/") {
|
||||||
hiddenService = hiddenService[0 : len(hiddenService)-1]
|
hiddenService = hiddenService[0 : len(hiddenService)-1]
|
||||||
|
|
|
@ -76,9 +76,12 @@ func (hps *HTTPProtocolScanner) ScanProtocol(hiddenService string, onionscanConf
|
||||||
}
|
}
|
||||||
|
|
||||||
func (hps *HTTPProtocolScanner) ScanPage(hiddenService string, page string, report *report.OnionScanReport, f func(scans.Scanner, string, int, string, *report.OnionScanReport)) {
|
func (hps *HTTPProtocolScanner) ScanPage(hiddenService string, page string, report *report.OnionScanReport, f func(scans.Scanner, string, int, string, *report.OnionScanReport)) {
|
||||||
response, err := hps.Client.Get("http://" + hiddenService + page)
|
if !strings.Contains(page, utils.WithoutSubdomains(hiddenService)) {
|
||||||
|
page = hiddenService + page
|
||||||
|
}
|
||||||
|
response, err := hps.Client.Get("http://" + page)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("Error connecting to %s%s %s\n", hiddenService, page, err)
|
log.Printf("Error connecting to http://%s %s\n", page, err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
defer response.Body.Close()
|
defer response.Body.Close()
|
||||||
|
|
|
@ -33,6 +33,7 @@ type OnionScanReport struct {
|
||||||
RelatedOnionServices []string `json:"relatedOnionServices"`
|
RelatedOnionServices []string `json:"relatedOnionServices"`
|
||||||
RelatedClearnetDomains []string `json:"relatedOnionDomains"`
|
RelatedClearnetDomains []string `json:"relatedOnionDomains"`
|
||||||
LinkedSites []string `json:"linkedSites"`
|
LinkedSites []string `json:"linkedSites"`
|
||||||
|
InternalPages []string `json:"InternalPages"`
|
||||||
IP []string `json:"ipAddresses"`
|
IP []string `json:"ipAddresses"`
|
||||||
OpenDirectories []string `json:"openDirectories"`
|
OpenDirectories []string `json:"openDirectories"`
|
||||||
ExifImages []ExifImage `json:"exifImages"`
|
ExifImages []ExifImage `json:"exifImages"`
|
||||||
|
@ -86,6 +87,11 @@ func (osr *OnionScanReport) AddLinkedSite(site string) {
|
||||||
utils.RemoveDuplicates(&osr.LinkedSites)
|
utils.RemoveDuplicates(&osr.LinkedSites)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (osr *OnionScanReport) AddInternalPage(site string) {
|
||||||
|
osr.InternalPages = append(osr.InternalPages, site)
|
||||||
|
utils.RemoveDuplicates(&osr.InternalPages)
|
||||||
|
}
|
||||||
|
|
||||||
func (osr *OnionScanReport) AddPGPKey(key string) {
|
func (osr *OnionScanReport) AddPGPKey(key string) {
|
||||||
osr.PGPKeys = append(osr.PGPKeys, key)
|
osr.PGPKeys = append(osr.PGPKeys, key)
|
||||||
utils.RemoveDuplicates(&osr.PGPKeys)
|
utils.RemoveDuplicates(&osr.PGPKeys)
|
||||||
|
|
|
@ -5,6 +5,7 @@ import (
|
||||||
"encoding/hex"
|
"encoding/hex"
|
||||||
"github.com/s-rah/onionscan/report"
|
"github.com/s-rah/onionscan/report"
|
||||||
"github.com/s-rah/onionscan/utils"
|
"github.com/s-rah/onionscan/utils"
|
||||||
|
"golang.org/x/net/html"
|
||||||
"log"
|
"log"
|
||||||
"net/url"
|
"net/url"
|
||||||
"regexp"
|
"regexp"
|
||||||
|
@ -12,9 +13,9 @@ import (
|
||||||
)
|
)
|
||||||
|
|
||||||
func StandardPageScan(scan Scanner, page string, status int, contents string, report *report.OnionScanReport) {
|
func StandardPageScan(scan Scanner, page string, status int, contents string, report *report.OnionScanReport) {
|
||||||
log.Printf("Scanning %s%s\n", report.HiddenService, page)
|
log.Printf("Scanning %s\n", page)
|
||||||
if status == 200 {
|
if status == 200 {
|
||||||
log.Printf("\tPage %s%s is Accessible\n", report.HiddenService, page)
|
log.Printf("\tPage %s is Accessible\n", page)
|
||||||
|
|
||||||
hash := sha1.Sum([]byte(contents))
|
hash := sha1.Sum([]byte(contents))
|
||||||
report.Hashes = append(report.Hashes, hex.EncodeToString(hash[:]))
|
report.Hashes = append(report.Hashes, hex.EncodeToString(hash[:]))
|
||||||
|
@ -31,44 +32,77 @@ func StandardPageScan(scan Scanner, page string, status int, contents string, re
|
||||||
}
|
}
|
||||||
|
|
||||||
new(PGPContentScan).ScanContent(contents, report)
|
new(PGPContentScan).ScanContent(contents, report)
|
||||||
|
|
||||||
|
log.Printf("\tScanning for Images\n")
|
||||||
domains := utils.ExtractDomains(contents)
|
domains := utils.ExtractDomains(contents)
|
||||||
|
|
||||||
|
// parser based on http://schier.co/blog/2015/04/26/a-simple-web-scraper-in-go.html
|
||||||
|
z := html.NewTokenizer(strings.NewReader(contents))
|
||||||
|
for {
|
||||||
|
tt := z.Next()
|
||||||
|
if tt == html.ErrorToken {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
t := z.Token()
|
||||||
|
|
||||||
|
// check for an href and src attributes
|
||||||
|
// TODO: don't crawl links with nofollow
|
||||||
|
|
||||||
|
if tt == html.StartTagToken {
|
||||||
|
isLink := t.Data == "a"
|
||||||
|
if isLink {
|
||||||
|
linkUrl := utils.GetAttribute(t, "href")
|
||||||
|
if len(linkUrl) > 1 {
|
||||||
|
domains = append(domains, linkUrl)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
isImage := t.Data == "img"
|
||||||
|
if isImage {
|
||||||
|
imageUrl := utils.GetAttribute(t, "src")
|
||||||
|
|
||||||
|
baseUrl, _ := url.Parse(imageUrl)
|
||||||
|
if utils.WithoutSubdomains(baseUrl.Host) == utils.WithoutSubdomains(report.HiddenService) {
|
||||||
|
scan.ScanPage(report.HiddenService, utils.WithoutProtocol(imageUrl), report, CheckExif)
|
||||||
|
log.Printf("\t Found internal image %s\n", imageUrl)
|
||||||
|
} else {
|
||||||
|
log.Printf("\t Not scanning remote image %s\n", imageUrl)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Printf("\tScanning for Links\n")
|
||||||
|
|
||||||
for _, domain := range domains {
|
for _, domain := range domains {
|
||||||
if !strings.HasPrefix(domain, "http://"+report.HiddenService) {
|
baseUrl, _ := url.Parse(domain)
|
||||||
|
if baseUrl.Host != "" && utils.WithoutSubdomains(baseUrl.Host) != utils.WithoutSubdomains(report.HiddenService) {
|
||||||
log.Printf("Found Related URL %s\n", domain)
|
log.Printf("Found Related URL %s\n", domain)
|
||||||
// TODO: Lots of information here which needs to be processed.
|
// TODO: Lots of information here which needs to be processed.
|
||||||
// * Links to standard sites - google / bitpay etc.
|
// * Links to standard sites - google / bitpay etc.
|
||||||
// * Links to other onion sites
|
// * Links to other onion sites
|
||||||
// * Links to obscure clearnet sites.
|
// * Links to obscure clearnet sites.
|
||||||
baseUrl, _ := url.Parse(domain)
|
|
||||||
report.AddLinkedSite(baseUrl.Host)
|
report.AddLinkedSite(baseUrl.Host)
|
||||||
} else {
|
} else {
|
||||||
// * Process FQDN internal links (unlikly)
|
// * Process FQDN internal links
|
||||||
log.Printf("Found Internal URL %s\n", domain)
|
log.Printf("Found Internal URL %s\n", domain)
|
||||||
|
report.AddInternalPage(baseUrl.Host)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
log.Printf("\tScanning for Images\n")
|
|
||||||
r := regexp.MustCompile("src=\"(" + "http://" + report.HiddenService + "/)?((.*?\\.jpg)|(.*?\\.png)|(.*?\\.jpeg)|(.*?\\.gif))\"")
|
|
||||||
foundImages := r.FindAllStringSubmatch(string(contents), -1)
|
|
||||||
for _, image := range foundImages {
|
|
||||||
log.Printf("\t Found image %s\n", image[2])
|
|
||||||
scan.ScanPage(report.HiddenService, "/"+image[2], report, CheckExif)
|
|
||||||
}
|
|
||||||
|
|
||||||
log.Printf("\tScanning for Referenced Directories\n")
|
log.Printf("\tScanning for Referenced Directories\n")
|
||||||
r = regexp.MustCompile("(src|href)=\"([^\"]*)\"")
|
r := regexp.MustCompile("(src|href)=\"([^\"]*)\"")
|
||||||
foundPaths := r.FindAllStringSubmatch(string(contents), -1)
|
foundPaths := r.FindAllStringSubmatch(string(contents), -1)
|
||||||
for _, regexpResults := range foundPaths {
|
for _, regexpResults := range foundPaths {
|
||||||
path := regexpResults[2]
|
path := regexpResults[2]
|
||||||
if strings.HasPrefix(path, "http") {
|
if strings.HasPrefix(path, "http") && !strings.Contains(path, utils.WithoutSubdomains(report.HiddenService)) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
term := strings.LastIndex(path, "/")
|
term := strings.LastIndex(path, "/")
|
||||||
if term > 0 {
|
if term > 0 {
|
||||||
log.Printf("\t Found Referenced Directory %s\n", path[:term])
|
log.Printf("\t Found Referenced Directory %s\n", path[:term])
|
||||||
report.AddPageReferencedDirectory(path[:term])
|
report.AddPageReferencedDirectory(utils.WithoutProtocol(path[:term]))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if status == 403 {
|
} else if status == 403 {
|
||||||
|
|
|
@ -0,0 +1,12 @@
|
||||||
|
package utils
|
||||||
|
|
||||||
|
import "golang.org/x/net/html"
|
||||||
|
|
||||||
|
func GetAttribute(tag html.Token, name string) string {
|
||||||
|
for _, a := range tag.Attr {
|
||||||
|
if a.Key == name {
|
||||||
|
return a.Val
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
|
@ -0,0 +1,29 @@
|
||||||
|
package utils
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/mvdan/xurls"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
func ExtractDomains(content string) []string {
|
||||||
|
return xurls.Strict.FindAllString(content, -1)
|
||||||
|
}
|
||||||
|
|
||||||
|
// WithoutSubdomains reduces a host name to its last two dot-separated labels,
// e.g. "www.example.onion" becomes "example.onion".
//
// It returns the empty string when urlhost contains no dot at all (including
// the empty string), so callers must be prepared for an empty result.
// The input is treated as a bare host: no port, path or scheme is removed.
func WithoutSubdomains(urlhost string) string {
	urlParts := strings.Split(urlhost, ".")
	if len(urlParts) < 2 {
		// A single label has no registrable "domain.tld" pair.
		return ""
	}
	return strings.Join(urlParts[len(urlParts)-2:], ".")
}
|
||||||
|
|
||||||
|
// WithoutProtocol strips a leading "http://" or "https://" from url and
// returns the remainder. Any other input is returned unchanged.
//
// The scheme is matched case-insensitively because URI schemes are
// case-insensitive, so "HTTP://host" is stripped just like "http://host".
func WithoutProtocol(url string) string {
	for _, scheme := range []string{"http://", "https://"} {
		// Compare only the leading len(scheme) bytes; the length check keeps
		// the slice expression in range for short inputs.
		if len(url) >= len(scheme) && strings.EqualFold(url[:len(scheme)], scheme) {
			return url[len(scheme):]
		}
	}
	return url
}
|
|
@ -1,7 +0,0 @@
|
||||||
package utils
|
|
||||||
|
|
||||||
import "github.com/mvdan/xurls"
|
|
||||||
|
|
||||||
func ExtractDomains(content string) []string {
|
|
||||||
return xurls.Strict.FindAllString(content, -1)
|
|
||||||
}
|
|
Loading…
Reference in New Issue