feat: add external links to the report

Dan Anglin 2024-08-28 07:39:24 +01:00
parent 0619c950f5
commit 5498ac7b4e
Signed by: dananglin
GPG key ID: 0C1D44CFBEE68638
5 changed files with 140 additions and 107 deletions

@@ -2,7 +2,7 @@
 ## Overview
-This web crawler crawls a given URL and generates a report for all the internal links it finds.
+This web crawler crawls a given website and generates a report for all the internal and external links found during the crawl.
 ### Repository mirrors
@@ -21,9 +21,15 @@ git clone https://github.com/dananglin/web-crawler.git
 ```
-Build the application.
-```
-go build -o crawler .
-```
+- Build with go
+```
+go build -o crawler .
+```
+- Or build with [mage](https://magefile.org/) if you have it installed.
+```
+mage build
+```
 Run the application specifying the website that you want to crawl.
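Either build method produces a `crawler` binary. The run step's exact command-line interface isn't shown in this diff, so the invocation below is a hypothetical sketch rather than the documented usage:

```
# Hypothetical invocation (assumes the target URL is a positional argument):
./crawler https://example.com
```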

@@ -6,12 +6,11 @@ import (
     "os"
     "sync"

-    "codeflow.dananglin.me.uk/apollo/web-crawler/internal/report"
     "codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
 )

 type Crawler struct {
-    pages      map[string]int
+    pages      map[string]pageStat
     baseURL    *url.URL
     mu         *sync.Mutex
     workerPool chan struct{}
@@ -19,6 +18,11 @@ type Crawler struct {
     maxPages   int
 }

+type pageStat struct {
+    count    int
+    internal bool
+}
+
 func NewCrawler(rawBaseURL string, maxWorkers, maxPages int) (*Crawler, error) {
     baseURL, err := url.Parse(rawBaseURL)
     if err != nil {
@@ -30,7 +34,7 @@ func NewCrawler(rawBaseURL string, maxWorkers, maxPages int) (*Crawler, error) {
     waitGroup.Add(1)

     crawler := Crawler{
-        pages:      make(map[string]int),
+        pages:      make(map[string]pageStat),
         baseURL:    baseURL,
         mu:         &sync.Mutex{},
         workerPool: make(chan struct{}, maxWorkers),
@@ -56,23 +60,6 @@ func (c *Crawler) Crawl(rawCurrentURL string) {
         return
     }

-    // if current URL is not on the same domain as the base URL then return early.
-    hasEqualDomain, err := c.HasEqualDomain(rawCurrentURL)
-    if err != nil {
-        fmt.Printf(
-            "WARNING: Unable to determine if %q has the same domain as %q; %v.\n",
-            rawCurrentURL,
-            c.baseURL.Hostname(),
-            err,
-        )
-
-        return
-    }
-
-    if !hasEqualDomain {
-        return
-    }

     // get normalised version of rawCurrentURL
     normalisedCurrentURL, err := util.NormaliseURL(rawCurrentURL)
     if err != nil {
@@ -81,9 +68,25 @@ func (c *Crawler) Crawl(rawCurrentURL string) {
         return
     }

+    isInternalLink, err := c.isInternalLink(rawCurrentURL)
+    if err != nil {
+        fmt.Printf(
+            "WARNING: Unable to determine if %q is an internal link; %v.\n",
+            rawCurrentURL,
+            err,
+        )
+
+        return
+    }
+
     // Add (or update) a record of the URL in the pages map.
     // If there's already an entry of the URL in the map then return early.
-    if existed := c.AddPageVisit(normalisedCurrentURL); existed {
+    if existed := c.addPageVisit(normalisedCurrentURL, isInternalLink); existed {
         return
     }
+
+    // if current URL is an external link then return early.
+    if !isInternalLink {
+        return
+    }
@@ -119,7 +122,10 @@ func (c *Crawler) Crawl(rawCurrentURL string) {
     }
 }

-func (c *Crawler) HasEqualDomain(rawURL string) (bool, error) {
+// isInternalLink evaluates whether the input URL is an internal link to the
+// base URL. An internal link is determined by comparing the host names of both
+// the input and base URLs.
+func (c *Crawler) isInternalLink(rawURL string) (bool, error) {
     parsedRawURL, err := url.Parse(rawURL)
     if err != nil {
         return false, fmt.Errorf("error parsing the URL %q: %w", rawURL, err)
@@ -132,16 +138,21 @@ func (c *Crawler) HasEqualDomain(rawURL string) (bool, error) {
 // If there is already a record of the URL then its record is updated (incremented)
 // and the method returns true. If the URL is not already recorded then it is created
 // and the method returns false.
-func (c *Crawler) AddPageVisit(normalisedURL string) bool {
+func (c *Crawler) addPageVisit(normalisedURL string, internal bool) bool {
     c.mu.Lock()
     defer c.mu.Unlock()

     _, exists := c.pages[normalisedURL]

     if exists {
-        c.pages[normalisedURL]++
+        stat := c.pages[normalisedURL]
+        stat.count++
+        c.pages[normalisedURL] = stat
     } else {
-        c.pages[normalisedURL] = 1
+        c.pages[normalisedURL] = pageStat{
+            count:    1,
+            internal: internal,
+        }
     }

     return exists
@@ -155,7 +166,7 @@ func (c *Crawler) PrintReport() {
     c.mu.Lock()
     defer c.mu.Unlock()

-    r := report.NewReport(c.baseURL.String(), c.pages)
+    r := newReport(c.baseURL.String(), c.pages)

     fmt.Fprint(os.Stdout, r)
 }
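To make the new bookkeeping concrete, here is a small standalone sketch (not part of the commit) of how `addPageVisit` now behaves: the first visit records a fresh `pageStat` and lets the crawl continue, while repeat visits only bump the counter. The mutex is omitted for brevity and the URL is made up.

```
package main

import "fmt"

type pageStat struct {
    count    int
    internal bool
}

// addPageVisit mirrors the method in the diff: it reports whether the
// page was already recorded, incrementing its count if so and creating
// a fresh pageStat otherwise.
func addPageVisit(pages map[string]pageStat, normalisedURL string, internal bool) bool {
    stat, exists := pages[normalisedURL]
    if exists {
        stat.count++
        pages[normalisedURL] = stat
    } else {
        pages[normalisedURL] = pageStat{count: 1, internal: internal}
    }

    return exists
}

func main() {
    pages := make(map[string]pageStat)

    fmt.Println(addPageVisit(pages, "example.com/blog", true)) // false: first visit, so the crawl continues
    fmt.Println(addPageVisit(pages, "example.com/blog", true)) // true: already seen, so Crawl returns early
    fmt.Println(pages["example.com/blog"])                     // {2 true}
}
```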

@@ -1,18 +1,17 @@
-package crawler_test
+package crawler

 import (
     "fmt"
     "slices"
     "testing"

-    "codeflow.dananglin.me.uk/apollo/web-crawler/internal/crawler"
     "codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
 )

 func TestCrawler(t *testing.T) {
     testBaseURL := "https://example.com"

-    testCrawler, err := crawler.NewCrawler(testBaseURL, 1, 10)
+    testCrawler, err := NewCrawler(testBaseURL, 1, 10)
     if err != nil {
         t.Fatalf("Test 'TestCrawler' FAILED: unexpected error creating the crawler: %v", err)
     }
@@ -50,7 +49,7 @@ func TestCrawler(t *testing.T) {
     }

     for ind, tc := range slices.All(testCasesForEqualDomains) {
-        t.Run(tc.name, testHasEqualDomains(
+        t.Run(tc.name, testIsInternalLink(
             testCrawler,
             ind+1,
             tc.name,
@@ -83,7 +82,7 @@ func TestCrawler(t *testing.T) {
     for ind, tc := range slices.All(testCasesForPages) {
         name := fmt.Sprintf("Adding %s to the pages map", tc.rawURL)

-        t.Run(name, testAddPageVisit(
+        t.Run(name, testHasVisited(
             testCrawler,
             ind+1,
             name,
@@ -93,8 +92,8 @@ func TestCrawler(t *testing.T) {
     }
 }

-func testHasEqualDomains(
-    testCrawler *crawler.Crawler,
+func testIsInternalLink(
+    testCrawler *Crawler,
     testNum int,
     testName string,
     rawURL string,
@@ -103,7 +102,7 @@ func testHasEqualDomains(
     return func(t *testing.T) {
         t.Parallel()

-        got, err := testCrawler.HasEqualDomain(rawURL)
+        got, err := testCrawler.isInternalLink(rawURL)
         if err != nil {
             t.Fatalf(
                 "Test %d - '%s' FAILED: unexpected error: %v",
@@ -132,8 +131,8 @@ func testHasEqualDomains(
     }
 }

-func testAddPageVisit(
-    testCrawler *crawler.Crawler,
+func testHasVisited(
+    testCrawler *Crawler,
     testNum int,
     testName string,
     rawURL string,
@@ -150,7 +149,7 @@ func testAddPageVisit(
         )
     }

-    gotVisited := testCrawler.AddPageVisit(normalisedURL)
+    gotVisited := testCrawler.addPageVisit(normalisedURL, true)
     if gotVisited != wantVisited {
         t.Errorf(
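The test file moves into the `crawler` package so it can exercise the now-unexported `isInternalLink` and `addPageVisit` directly. For reference, here is a standalone sketch of the host-name comparison that `isInternalLink` is documented to perform; the method's closing lines fall outside the hunks above, so the helper below is an assumed reconstruction, not the committed code.

```
package main

import (
    "fmt"
    "net/url"
)

// sameHost is a hypothetical helper replicating the documented check:
// a link is internal when its host name matches the base URL's.
func sameHost(rawURL, rawBaseURL string) (bool, error) {
    parsedURL, err := url.Parse(rawURL)
    if err != nil {
        return false, fmt.Errorf("error parsing the URL %q: %w", rawURL, err)
    }

    baseURL, err := url.Parse(rawBaseURL)
    if err != nil {
        return false, fmt.Errorf("error parsing the URL %q: %w", rawBaseURL, err)
    }

    return parsedURL.Hostname() == baseURL.Hostname(), nil
}

func main() {
    internal, _ := sameHost("https://example.com/about", "https://example.com")
    external, _ := sameHost("https://other.org/page", "https://example.com")
    fmt.Println(internal, external) // true false
}
```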

@@ -0,0 +1,83 @@
+package crawler
+
+import (
+    "cmp"
+    "maps"
+    "slices"
+    "strconv"
+    "strings"
+)
+
+type report struct {
+    baseURL string
+    records []record
+}
+
+type record struct {
+    link     string
+    count    int
+    internal bool
+}
+
+func newReport(baseURL string, pages map[string]pageStat) report {
+    records := make([]record, 0)
+
+    for link, stats := range maps.All(pages) {
+        record := record{
+            link:     link,
+            count:    stats.count,
+            internal: stats.internal,
+        }
+
+        records = append(records, record)
+    }
+
+    report := report{
+        baseURL: baseURL,
+        records: records,
+    }
+
+    report.sortRecords()
+
+    return report
+}
+
+func (r *report) sortRecords() {
+    // First sort records by count (in reverse order hopefully)
+    // Then sort records by name if two elements have the same count.
+    slices.SortFunc(r.records, func(a, b record) int {
+        if n := cmp.Compare(a.count, b.count); n != 0 {
+            return -1 * n
+        }
+
+        return strings.Compare(a.link, b.link)
+    })
+}
+
+func (r report) String() string {
+    var builder strings.Builder
+
+    titlebar := strings.Repeat("\u2500", 80)
+
+    builder.WriteString("\n" + titlebar)
+    builder.WriteString("\n" + "REPORT for " + r.baseURL)
+    builder.WriteString("\n" + titlebar)
+
+    for ind := range slices.All(r.records) {
+        linkType := "internal"
+        if !r.records[ind].internal {
+            linkType = "external"
+        }
+
+        links := "links"
+        if r.records[ind].count == 1 {
+            links = "link"
+        }
+
+        builder.WriteString("\nFound " + strconv.Itoa(r.records[ind].count) + " " + linkType + " " + links + " to " + r.records[ind].link)
+    }
+
+    builder.WriteString("\n")
+
+    return builder.String()
+}
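Putting `sortRecords` and `String` together, a report for a small crawl would render along these lines (the links and counts here are illustrative): records are ordered by descending count, ties broken alphabetically by link, and the `link`/`links` wording tracks the count.

```
────────────────────────────────────────────────────────────────────────────────
REPORT for https://example.com
────────────────────────────────────────────────────────────────────────────────
Found 3 internal links to example.com/blog
Found 2 external links to github.com/dananglin
Found 1 internal link to example.com/about
```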

@@ -1,66 +0,0 @@
-package report
-
-import (
-    "cmp"
-    "maps"
-    "slices"
-    "strconv"
-    "strings"
-)
-
-type Report struct {
-    baseURL string
-    records []Record
-}
-
-type Record struct {
-    link  string
-    count int
-}
-
-func NewReport(baseURL string, pages map[string]int) Report {
-    records := make([]Record, 0)
-
-    for link, count := range maps.All(pages) {
-        records = append(records, Record{link: link, count: count})
-    }
-
-    report := Report{
-        baseURL: baseURL,
-        records: records,
-    }
-
-    report.SortRecords()
-
-    return report
-}
-
-func (r *Report) SortRecords() {
-    // First sort records by count (in reverse order hopefully)
-    // Then sort records by name if two elements have the same count.
-    slices.SortFunc(r.records, func(a, b Record) int {
-        if n := cmp.Compare(a.count, b.count); n != 0 {
-            return -1 * n
-        }
-
-        return strings.Compare(a.link, b.link)
-    })
-}
-
-func (r Report) String() string {
-    var builder strings.Builder
-
-    titlebar := strings.Repeat("\u2500", 80)
-
-    builder.WriteString("\n" + titlebar)
-    builder.WriteString("\n" + "REPORT for " + r.baseURL)
-    builder.WriteString("\n" + titlebar)
-
-    for ind := range slices.All(r.records) {
-        builder.WriteString("\nFound " + strconv.Itoa(r.records[ind].count) + " internal links to " + r.records[ind].link)
-    }
-
-    builder.WriteString("\n")
-
-    return builder.String()
-}