generated from templates/go-generic
feat: add external links to the report #3
5 changed files with 140 additions and 107 deletions
14
README.md
14
README.md
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
## Overview
|
## Overview
|
||||||
|
|
||||||
This web crawler crawls a given URL and generates a report for all the internal links it finds.
|
This web crawler crawls a given website and generates a report for all the internal and external links found during the crawl.
|
||||||
|
|
||||||
### Repository mirrors
|
### Repository mirrors
|
||||||
|
|
||||||
|
@ -21,9 +21,15 @@ git clone https://github.com/dananglin/web-crawler.git
|
||||||
```
|
```
|
||||||
|
|
||||||
Build the application.
|
Build the application.
|
||||||
```
|
|
||||||
go build -o crawler .
|
- Build with go
|
||||||
```
|
```
|
||||||
|
go build -o crawler .
|
||||||
|
```
|
||||||
|
- Or build with [mage](https://magefile.org/) if you have it installed.
|
||||||
|
```
|
||||||
|
mage build
|
||||||
|
```
|
||||||
|
|
||||||
Run the application specifying the website that you want to crawl.
|
Run the application specifying the website that you want to crawl.
|
||||||
|
|
||||||
|
|
|
@ -6,12 +6,11 @@ import (
|
||||||
"os"
|
"os"
|
||||||
"sync"
|
"sync"
|
||||||
|
|
||||||
"codeflow.dananglin.me.uk/apollo/web-crawler/internal/report"
|
|
||||||
"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
|
"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
|
||||||
)
|
)
|
||||||
|
|
||||||
type Crawler struct {
|
type Crawler struct {
|
||||||
pages map[string]int
|
pages map[string]pageStat
|
||||||
baseURL *url.URL
|
baseURL *url.URL
|
||||||
mu *sync.Mutex
|
mu *sync.Mutex
|
||||||
workerPool chan struct{}
|
workerPool chan struct{}
|
||||||
|
@ -19,6 +18,11 @@ type Crawler struct {
|
||||||
maxPages int
|
maxPages int
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// pageStat records the crawl statistics for a single normalised URL:
// how many times the link was encountered and whether it is internal
// (shares the base URL's host) or external.
type pageStat struct {
	count    int  // number of times this link was found during the crawl
	internal bool // true when the link's host matches the base URL's host
}
|
||||||
|
|
||||||
func NewCrawler(rawBaseURL string, maxWorkers, maxPages int) (*Crawler, error) {
|
func NewCrawler(rawBaseURL string, maxWorkers, maxPages int) (*Crawler, error) {
|
||||||
baseURL, err := url.Parse(rawBaseURL)
|
baseURL, err := url.Parse(rawBaseURL)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -30,7 +34,7 @@ func NewCrawler(rawBaseURL string, maxWorkers, maxPages int) (*Crawler, error) {
|
||||||
waitGroup.Add(1)
|
waitGroup.Add(1)
|
||||||
|
|
||||||
crawler := Crawler{
|
crawler := Crawler{
|
||||||
pages: make(map[string]int),
|
pages: make(map[string]pageStat),
|
||||||
baseURL: baseURL,
|
baseURL: baseURL,
|
||||||
mu: &sync.Mutex{},
|
mu: &sync.Mutex{},
|
||||||
workerPool: make(chan struct{}, maxWorkers),
|
workerPool: make(chan struct{}, maxWorkers),
|
||||||
|
@ -56,23 +60,6 @@ func (c *Crawler) Crawl(rawCurrentURL string) {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// if current URL is not on the same domain as the base URL then return early.
|
|
||||||
hasEqualDomain, err := c.HasEqualDomain(rawCurrentURL)
|
|
||||||
if err != nil {
|
|
||||||
fmt.Printf(
|
|
||||||
"WARNING: Unable to determine if %q has the same domain as %q; %v.\n",
|
|
||||||
rawCurrentURL,
|
|
||||||
c.baseURL.Hostname(),
|
|
||||||
err,
|
|
||||||
)
|
|
||||||
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
if !hasEqualDomain {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
// get normalised version of rawCurrentURL
|
// get normalised version of rawCurrentURL
|
||||||
normalisedCurrentURL, err := util.NormaliseURL(rawCurrentURL)
|
normalisedCurrentURL, err := util.NormaliseURL(rawCurrentURL)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -81,9 +68,25 @@ func (c *Crawler) Crawl(rawCurrentURL string) {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
isInternalLink, err := c.isInternalLink(rawCurrentURL)
|
||||||
|
if err != nil {
|
||||||
|
fmt.Printf(
|
||||||
|
"WARNING: Unable to determine if %q is an internal link; %v.\n",
|
||||||
|
rawCurrentURL,
|
||||||
|
err,
|
||||||
|
)
|
||||||
|
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
// Add (or update) a record of the URL in the pages map.
|
// Add (or update) a record of the URL in the pages map.
|
||||||
// If there's already an entry of the URL in the map then return early.
|
// If there's already an entry of the URL in the map then return early.
|
||||||
if existed := c.AddPageVisit(normalisedCurrentURL); existed {
|
if existed := c.addPageVisit(normalisedCurrentURL, isInternalLink); existed {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// if current URL is an external link then return early.
|
||||||
|
if !isInternalLink {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -119,7 +122,10 @@ func (c *Crawler) Crawl(rawCurrentURL string) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Crawler) HasEqualDomain(rawURL string) (bool, error) {
|
// isInternalLink evaluates whether the input URL is an internal link to the
|
||||||
|
// base URL. An internal link is determined by comparing the host names of both
|
||||||
|
// the input and base URLs.
|
||||||
|
func (c *Crawler) isInternalLink(rawURL string) (bool, error) {
|
||||||
parsedRawURL, err := url.Parse(rawURL)
|
parsedRawURL, err := url.Parse(rawURL)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return false, fmt.Errorf("error parsing the URL %q: %w", rawURL, err)
|
return false, fmt.Errorf("error parsing the URL %q: %w", rawURL, err)
|
||||||
|
@ -132,16 +138,21 @@ func (c *Crawler) HasEqualDomain(rawURL string) (bool, error) {
|
||||||
// If there is already a record of the URL then its record is updated (incremented)
|
// If there is already a record of the URL then its record is updated (incremented)
|
||||||
// and the method returns true. If the URL is not already recorded then it is created
|
// and the method returns true. If the URL is not already recorded then it is created
|
||||||
// and the method returns false.
|
// and the method returns false.
|
||||||
func (c *Crawler) AddPageVisit(normalisedURL string) bool {
|
func (c *Crawler) addPageVisit(normalisedURL string, internal bool) bool {
|
||||||
c.mu.Lock()
|
c.mu.Lock()
|
||||||
defer c.mu.Unlock()
|
defer c.mu.Unlock()
|
||||||
|
|
||||||
_, exists := c.pages[normalisedURL]
|
_, exists := c.pages[normalisedURL]
|
||||||
|
|
||||||
if exists {
|
if exists {
|
||||||
c.pages[normalisedURL]++
|
stat := c.pages[normalisedURL]
|
||||||
|
stat.count++
|
||||||
|
c.pages[normalisedURL] = stat
|
||||||
} else {
|
} else {
|
||||||
c.pages[normalisedURL] = 1
|
c.pages[normalisedURL] = pageStat{
|
||||||
|
count: 1,
|
||||||
|
internal: internal,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return exists
|
return exists
|
||||||
|
@ -155,7 +166,7 @@ func (c *Crawler) PrintReport() {
|
||||||
c.mu.Lock()
|
c.mu.Lock()
|
||||||
defer c.mu.Unlock()
|
defer c.mu.Unlock()
|
||||||
|
|
||||||
r := report.NewReport(c.baseURL.String(), c.pages)
|
r := newReport(c.baseURL.String(), c.pages)
|
||||||
|
|
||||||
fmt.Fprint(os.Stdout, r)
|
fmt.Fprint(os.Stdout, r)
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,18 +1,17 @@
|
||||||
package crawler_test
|
package crawler
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"slices"
|
"slices"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"codeflow.dananglin.me.uk/apollo/web-crawler/internal/crawler"
|
|
||||||
"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
|
"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestCrawler(t *testing.T) {
|
func TestCrawler(t *testing.T) {
|
||||||
testBaseURL := "https://example.com"
|
testBaseURL := "https://example.com"
|
||||||
|
|
||||||
testCrawler, err := crawler.NewCrawler(testBaseURL, 1, 10)
|
testCrawler, err := NewCrawler(testBaseURL, 1, 10)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("Test 'TestCrawler' FAILED: unexpected error creating the crawler: %v", err)
|
t.Fatalf("Test 'TestCrawler' FAILED: unexpected error creating the crawler: %v", err)
|
||||||
}
|
}
|
||||||
|
@ -50,7 +49,7 @@ func TestCrawler(t *testing.T) {
|
||||||
}
|
}
|
||||||
|
|
||||||
for ind, tc := range slices.All(testCasesForEqualDomains) {
|
for ind, tc := range slices.All(testCasesForEqualDomains) {
|
||||||
t.Run(tc.name, testHasEqualDomains(
|
t.Run(tc.name, testIsInternalLink(
|
||||||
testCrawler,
|
testCrawler,
|
||||||
ind+1,
|
ind+1,
|
||||||
tc.name,
|
tc.name,
|
||||||
|
@ -83,7 +82,7 @@ func TestCrawler(t *testing.T) {
|
||||||
|
|
||||||
for ind, tc := range slices.All(testCasesForPages) {
|
for ind, tc := range slices.All(testCasesForPages) {
|
||||||
name := fmt.Sprintf("Adding %s to the pages map", tc.rawURL)
|
name := fmt.Sprintf("Adding %s to the pages map", tc.rawURL)
|
||||||
t.Run(name, testAddPageVisit(
|
t.Run(name, testHasVisited(
|
||||||
testCrawler,
|
testCrawler,
|
||||||
ind+1,
|
ind+1,
|
||||||
name,
|
name,
|
||||||
|
@ -93,8 +92,8 @@ func TestCrawler(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func testHasEqualDomains(
|
func testIsInternalLink(
|
||||||
testCrawler *crawler.Crawler,
|
testCrawler *Crawler,
|
||||||
testNum int,
|
testNum int,
|
||||||
testName string,
|
testName string,
|
||||||
rawURL string,
|
rawURL string,
|
||||||
|
@ -103,7 +102,7 @@ func testHasEqualDomains(
|
||||||
return func(t *testing.T) {
|
return func(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
got, err := testCrawler.HasEqualDomain(rawURL)
|
got, err := testCrawler.isInternalLink(rawURL)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf(
|
t.Fatalf(
|
||||||
"Test %d - '%s' FAILED: unexpected error: %v",
|
"Test %d - '%s' FAILED: unexpected error: %v",
|
||||||
|
@ -132,8 +131,8 @@ func testHasEqualDomains(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func testAddPageVisit(
|
func testHasVisited(
|
||||||
testCrawler *crawler.Crawler,
|
testCrawler *Crawler,
|
||||||
testNum int,
|
testNum int,
|
||||||
testName string,
|
testName string,
|
||||||
rawURL string,
|
rawURL string,
|
||||||
|
@ -150,7 +149,7 @@ func testAddPageVisit(
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
gotVisited := testCrawler.AddPageVisit(normalisedURL)
|
gotVisited := testCrawler.addPageVisit(normalisedURL, true)
|
||||||
|
|
||||||
if gotVisited != wantVisited {
|
if gotVisited != wantVisited {
|
||||||
t.Errorf(
|
t.Errorf(
|
||||||
|
|
83
internal/crawler/report.go
Normal file
83
internal/crawler/report.go
Normal file
|
@ -0,0 +1,83 @@
|
||||||
|
package crawler
|
||||||
|
|
||||||
|
import (
|
||||||
|
"cmp"
|
||||||
|
"maps"
|
||||||
|
"slices"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// report is the final summary of a crawl: the base URL that was crawled
// plus one record per unique link discovered, sorted for presentation.
type report struct {
	baseURL string   // root URL the crawl started from
	records []record // one entry per unique link; ordering set by sortRecords
}

// record summarises a single discovered link.
type record struct {
	link     string // normalised URL of the page
	count    int    // number of times the link was found
	internal bool   // true when the link shares the base URL's host
}
|
||||||
|
|
||||||
|
func newReport(baseURL string, pages map[string]pageStat) report {
|
||||||
|
records := make([]record, 0)
|
||||||
|
|
||||||
|
for link, stats := range maps.All(pages) {
|
||||||
|
record := record{
|
||||||
|
link: link,
|
||||||
|
count: stats.count,
|
||||||
|
internal: stats.internal,
|
||||||
|
}
|
||||||
|
|
||||||
|
records = append(records, record)
|
||||||
|
}
|
||||||
|
|
||||||
|
report := report{
|
||||||
|
baseURL: baseURL,
|
||||||
|
records: records,
|
||||||
|
}
|
||||||
|
|
||||||
|
report.sortRecords()
|
||||||
|
|
||||||
|
return report
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *report) sortRecords() {
|
||||||
|
// First sort records by count (in reverse order hopefully)
|
||||||
|
// Then sort records by name if two elements have the same count.
|
||||||
|
slices.SortFunc(r.records, func(a, b record) int {
|
||||||
|
if n := cmp.Compare(a.count, b.count); n != 0 {
|
||||||
|
return -1 * n
|
||||||
|
}
|
||||||
|
|
||||||
|
return strings.Compare(a.link, b.link)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r report) String() string {
|
||||||
|
var builder strings.Builder
|
||||||
|
|
||||||
|
titlebar := strings.Repeat("\u2500", 80)
|
||||||
|
|
||||||
|
builder.WriteString("\n" + titlebar)
|
||||||
|
builder.WriteString("\n" + "REPORT for " + r.baseURL)
|
||||||
|
builder.WriteString("\n" + titlebar)
|
||||||
|
|
||||||
|
for ind := range slices.All(r.records) {
|
||||||
|
linkType := "internal"
|
||||||
|
if !r.records[ind].internal {
|
||||||
|
linkType = "external"
|
||||||
|
}
|
||||||
|
|
||||||
|
links := "links"
|
||||||
|
if r.records[ind].count == 1 {
|
||||||
|
links = "link"
|
||||||
|
}
|
||||||
|
|
||||||
|
builder.WriteString("\nFound " + strconv.Itoa(r.records[ind].count) + " " + linkType + " " + links + " to " + r.records[ind].link)
|
||||||
|
}
|
||||||
|
|
||||||
|
builder.WriteString("\n")
|
||||||
|
|
||||||
|
return builder.String()
|
||||||
|
}
|
|
@ -1,66 +0,0 @@
|
||||||
package report
|
|
||||||
|
|
||||||
import (
|
|
||||||
"cmp"
|
|
||||||
"maps"
|
|
||||||
"slices"
|
|
||||||
"strconv"
|
|
||||||
"strings"
|
|
||||||
)
|
|
||||||
|
|
||||||
type Report struct {
|
|
||||||
baseURL string
|
|
||||||
records []Record
|
|
||||||
}
|
|
||||||
|
|
||||||
type Record struct {
|
|
||||||
link string
|
|
||||||
count int
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewReport(baseURL string, pages map[string]int) Report {
|
|
||||||
records := make([]Record, 0)
|
|
||||||
|
|
||||||
for link, count := range maps.All(pages) {
|
|
||||||
records = append(records, Record{link: link, count: count})
|
|
||||||
}
|
|
||||||
|
|
||||||
report := Report{
|
|
||||||
baseURL: baseURL,
|
|
||||||
records: records,
|
|
||||||
}
|
|
||||||
|
|
||||||
report.SortRecords()
|
|
||||||
|
|
||||||
return report
|
|
||||||
}
|
|
||||||
|
|
||||||
func (r *Report) SortRecords() {
|
|
||||||
// First sort records by count (in reverse order hopefully)
|
|
||||||
// Then sort records by name if two elements have the same count.
|
|
||||||
slices.SortFunc(r.records, func(a, b Record) int {
|
|
||||||
if n := cmp.Compare(a.count, b.count); n != 0 {
|
|
||||||
return -1 * n
|
|
||||||
}
|
|
||||||
|
|
||||||
return strings.Compare(a.link, b.link)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func (r Report) String() string {
|
|
||||||
var builder strings.Builder
|
|
||||||
|
|
||||||
titlebar := strings.Repeat("\u2500", 80)
|
|
||||||
|
|
||||||
builder.WriteString("\n" + titlebar)
|
|
||||||
builder.WriteString("\n" + "REPORT for " + r.baseURL)
|
|
||||||
builder.WriteString("\n" + titlebar)
|
|
||||||
|
|
||||||
for ind := range slices.All(r.records) {
|
|
||||||
builder.WriteString("\nFound " + strconv.Itoa(r.records[ind].count) + " internal links to " + r.records[ind].link)
|
|
||||||
}
|
|
||||||
|
|
||||||
builder.WriteString("\n")
|
|
||||||
|
|
||||||
return builder.String()
|
|
||||||
}
|
|
Loading…
Reference in a new issue