generated from templates/go-generic
feat: add external links to the report #3
5 changed files with 140 additions and 107 deletions
14
README.md
14
README.md
|
@ -2,7 +2,7 @@
|
|||
|
||||
## Overview
|
||||
|
||||
This web crawler crawls a given URL and generates a report for all the internal links it finds.
|
||||
This web crawler crawls a given website and generates a report for all the internal and external links found during the crawl.
|
||||
|
||||
### Repository mirrors
|
||||
|
||||
|
@ -21,9 +21,15 @@ git clone https://github.com/dananglin/web-crawler.git
|
|||
```
|
||||
|
||||
Build the application.
|
||||
```
|
||||
go build -o crawler .
|
||||
```
|
||||
|
||||
- Build with go
|
||||
```
|
||||
go build -o crawler .
|
||||
```
|
||||
- Or build with [mage](https://magefile.org/) if you have it installed.
|
||||
```
|
||||
mage build
|
||||
```
|
||||
|
||||
Run the application specifying the website that you want to crawl.
|
||||
|
||||
|
|
|
@ -6,12 +6,11 @@ import (
|
|||
"os"
|
||||
"sync"
|
||||
|
||||
"codeflow.dananglin.me.uk/apollo/web-crawler/internal/report"
|
||||
"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
|
||||
)
|
||||
|
||||
type Crawler struct {
|
||||
pages map[string]int
|
||||
pages map[string]pageStat
|
||||
baseURL *url.URL
|
||||
mu *sync.Mutex
|
||||
workerPool chan struct{}
|
||||
|
@ -19,6 +18,11 @@ type Crawler struct {
|
|||
maxPages int
|
||||
}
|
||||
|
||||
// pageStat holds the crawl statistics collected for a single page.
type pageStat struct {
	// count is the number of times a link to the page was found.
	count int
	// internal reports whether the link points to the same host as the base URL.
	internal bool
}
|
||||
|
||||
func NewCrawler(rawBaseURL string, maxWorkers, maxPages int) (*Crawler, error) {
|
||||
baseURL, err := url.Parse(rawBaseURL)
|
||||
if err != nil {
|
||||
|
@ -30,7 +34,7 @@ func NewCrawler(rawBaseURL string, maxWorkers, maxPages int) (*Crawler, error) {
|
|||
waitGroup.Add(1)
|
||||
|
||||
crawler := Crawler{
|
||||
pages: make(map[string]int),
|
||||
pages: make(map[string]pageStat),
|
||||
baseURL: baseURL,
|
||||
mu: &sync.Mutex{},
|
||||
workerPool: make(chan struct{}, maxWorkers),
|
||||
|
@ -56,23 +60,6 @@ func (c *Crawler) Crawl(rawCurrentURL string) {
|
|||
return
|
||||
}
|
||||
|
||||
// if current URL is not on the same domain as the base URL then return early.
|
||||
hasEqualDomain, err := c.HasEqualDomain(rawCurrentURL)
|
||||
if err != nil {
|
||||
fmt.Printf(
|
||||
"WARNING: Unable to determine if %q has the same domain as %q; %v.\n",
|
||||
rawCurrentURL,
|
||||
c.baseURL.Hostname(),
|
||||
err,
|
||||
)
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
if !hasEqualDomain {
|
||||
return
|
||||
}
|
||||
|
||||
// get normalised version of rawCurrentURL
|
||||
normalisedCurrentURL, err := util.NormaliseURL(rawCurrentURL)
|
||||
if err != nil {
|
||||
|
@ -81,9 +68,25 @@ func (c *Crawler) Crawl(rawCurrentURL string) {
|
|||
return
|
||||
}
|
||||
|
||||
isInternalLink, err := c.isInternalLink(rawCurrentURL)
|
||||
if err != nil {
|
||||
fmt.Printf(
|
||||
"WARNING: Unable to determine if %q is an internal link; %v.\n",
|
||||
rawCurrentURL,
|
||||
err,
|
||||
)
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
// Add (or update) a record of the URL in the pages map.
|
||||
// If there's already an entry of the URL in the map then return early.
|
||||
if existed := c.AddPageVisit(normalisedCurrentURL); existed {
|
||||
if existed := c.addPageVisit(normalisedCurrentURL, isInternalLink); existed {
|
||||
return
|
||||
}
|
||||
|
||||
// if current URL is an external link then return early.
|
||||
if !isInternalLink {
|
||||
return
|
||||
}
|
||||
|
||||
|
@ -119,7 +122,10 @@ func (c *Crawler) Crawl(rawCurrentURL string) {
|
|||
}
|
||||
}
|
||||
|
||||
func (c *Crawler) HasEqualDomain(rawURL string) (bool, error) {
|
||||
// isInternalLink evaluates whether the input URL is an internal link to the
|
||||
// base URL. An internal link is determined by comparing the host names of both
|
||||
// the input and base URLs.
|
||||
func (c *Crawler) isInternalLink(rawURL string) (bool, error) {
|
||||
parsedRawURL, err := url.Parse(rawURL)
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("error parsing the URL %q: %w", rawURL, err)
|
||||
|
@ -132,16 +138,21 @@ func (c *Crawler) HasEqualDomain(rawURL string) (bool, error) {
|
|||
// If there is already a record of the URL then its record is updated (incremented)
|
||||
// and the method returns true. If the URL is not already recorded then it is created
|
||||
// and the method returns false.
|
||||
func (c *Crawler) AddPageVisit(normalisedURL string) bool {
|
||||
func (c *Crawler) addPageVisit(normalisedURL string, internal bool) bool {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
|
||||
_, exists := c.pages[normalisedURL]
|
||||
|
||||
if exists {
|
||||
c.pages[normalisedURL]++
|
||||
stat := c.pages[normalisedURL]
|
||||
stat.count++
|
||||
c.pages[normalisedURL] = stat
|
||||
} else {
|
||||
c.pages[normalisedURL] = 1
|
||||
c.pages[normalisedURL] = pageStat{
|
||||
count: 1,
|
||||
internal: internal,
|
||||
}
|
||||
}
|
||||
|
||||
return exists
|
||||
|
@ -155,7 +166,7 @@ func (c *Crawler) PrintReport() {
|
|||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
|
||||
r := report.NewReport(c.baseURL.String(), c.pages)
|
||||
r := newReport(c.baseURL.String(), c.pages)
|
||||
|
||||
fmt.Fprint(os.Stdout, r)
|
||||
}
|
||||
|
|
|
@ -1,18 +1,17 @@
|
|||
package crawler_test
|
||||
package crawler
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"slices"
|
||||
"testing"
|
||||
|
||||
"codeflow.dananglin.me.uk/apollo/web-crawler/internal/crawler"
|
||||
"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
|
||||
)
|
||||
|
||||
func TestCrawler(t *testing.T) {
|
||||
testBaseURL := "https://example.com"
|
||||
|
||||
testCrawler, err := crawler.NewCrawler(testBaseURL, 1, 10)
|
||||
testCrawler, err := NewCrawler(testBaseURL, 1, 10)
|
||||
if err != nil {
|
||||
t.Fatalf("Test 'TestCrawler' FAILED: unexpected error creating the crawler: %v", err)
|
||||
}
|
||||
|
@ -50,7 +49,7 @@ func TestCrawler(t *testing.T) {
|
|||
}
|
||||
|
||||
for ind, tc := range slices.All(testCasesForEqualDomains) {
|
||||
t.Run(tc.name, testHasEqualDomains(
|
||||
t.Run(tc.name, testIsInternalLink(
|
||||
testCrawler,
|
||||
ind+1,
|
||||
tc.name,
|
||||
|
@ -83,7 +82,7 @@ func TestCrawler(t *testing.T) {
|
|||
|
||||
for ind, tc := range slices.All(testCasesForPages) {
|
||||
name := fmt.Sprintf("Adding %s to the pages map", tc.rawURL)
|
||||
t.Run(name, testAddPageVisit(
|
||||
t.Run(name, testHasVisited(
|
||||
testCrawler,
|
||||
ind+1,
|
||||
name,
|
||||
|
@ -93,8 +92,8 @@ func TestCrawler(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func testHasEqualDomains(
|
||||
testCrawler *crawler.Crawler,
|
||||
func testIsInternalLink(
|
||||
testCrawler *Crawler,
|
||||
testNum int,
|
||||
testName string,
|
||||
rawURL string,
|
||||
|
@ -103,7 +102,7 @@ func testHasEqualDomains(
|
|||
return func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
got, err := testCrawler.HasEqualDomain(rawURL)
|
||||
got, err := testCrawler.isInternalLink(rawURL)
|
||||
if err != nil {
|
||||
t.Fatalf(
|
||||
"Test %d - '%s' FAILED: unexpected error: %v",
|
||||
|
@ -132,8 +131,8 @@ func testHasEqualDomains(
|
|||
}
|
||||
}
|
||||
|
||||
func testAddPageVisit(
|
||||
testCrawler *crawler.Crawler,
|
||||
func testHasVisited(
|
||||
testCrawler *Crawler,
|
||||
testNum int,
|
||||
testName string,
|
||||
rawURL string,
|
||||
|
@ -150,7 +149,7 @@ func testAddPageVisit(
|
|||
)
|
||||
}
|
||||
|
||||
gotVisited := testCrawler.AddPageVisit(normalisedURL)
|
||||
gotVisited := testCrawler.addPageVisit(normalisedURL, true)
|
||||
|
||||
if gotVisited != wantVisited {
|
||||
t.Errorf(
|
||||
|
|
83
internal/crawler/report.go
Normal file
83
internal/crawler/report.go
Normal file
|
@ -0,0 +1,83 @@
|
|||
package crawler
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"maps"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// report is a printable summary of a crawl: the base URL that was
// crawled plus one record per discovered link.
type report struct {
	baseURL string
	records []record
}

// record describes a single link found during the crawl.
type record struct {
	// link is the normalised URL of the page.
	link string
	// count is the number of times the link was found.
	count int
	// internal reports whether the link shares the base URL's host.
	internal bool
}
|
||||
|
||||
func newReport(baseURL string, pages map[string]pageStat) report {
|
||||
records := make([]record, 0)
|
||||
|
||||
for link, stats := range maps.All(pages) {
|
||||
record := record{
|
||||
link: link,
|
||||
count: stats.count,
|
||||
internal: stats.internal,
|
||||
}
|
||||
|
||||
records = append(records, record)
|
||||
}
|
||||
|
||||
report := report{
|
||||
baseURL: baseURL,
|
||||
records: records,
|
||||
}
|
||||
|
||||
report.sortRecords()
|
||||
|
||||
return report
|
||||
}
|
||||
|
||||
func (r *report) sortRecords() {
|
||||
// First sort records by count (in reverse order hopefully)
|
||||
// Then sort records by name if two elements have the same count.
|
||||
slices.SortFunc(r.records, func(a, b record) int {
|
||||
if n := cmp.Compare(a.count, b.count); n != 0 {
|
||||
return -1 * n
|
||||
}
|
||||
|
||||
return strings.Compare(a.link, b.link)
|
||||
})
|
||||
}
|
||||
|
||||
func (r report) String() string {
|
||||
var builder strings.Builder
|
||||
|
||||
titlebar := strings.Repeat("\u2500", 80)
|
||||
|
||||
builder.WriteString("\n" + titlebar)
|
||||
builder.WriteString("\n" + "REPORT for " + r.baseURL)
|
||||
builder.WriteString("\n" + titlebar)
|
||||
|
||||
for ind := range slices.All(r.records) {
|
||||
linkType := "internal"
|
||||
if !r.records[ind].internal {
|
||||
linkType = "external"
|
||||
}
|
||||
|
||||
links := "links"
|
||||
if r.records[ind].count == 1 {
|
||||
links = "link"
|
||||
}
|
||||
|
||||
builder.WriteString("\nFound " + strconv.Itoa(r.records[ind].count) + " " + linkType + " " + links + " to " + r.records[ind].link)
|
||||
}
|
||||
|
||||
builder.WriteString("\n")
|
||||
|
||||
return builder.String()
|
||||
}
|
|
@ -1,66 +0,0 @@
|
|||
package report
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"maps"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Report is a printable summary of a crawl: the base URL that was
// crawled plus one Record per discovered link.
type Report struct {
	baseURL string
	records []Record
}

// Record describes a single link found during the crawl.
type Record struct {
	// link is the normalised URL of the page.
	link string
	// count is the number of times the link was found.
	count int
}
|
||||
|
||||
func NewReport(baseURL string, pages map[string]int) Report {
|
||||
records := make([]Record, 0)
|
||||
|
||||
for link, count := range maps.All(pages) {
|
||||
records = append(records, Record{link: link, count: count})
|
||||
}
|
||||
|
||||
report := Report{
|
||||
baseURL: baseURL,
|
||||
records: records,
|
||||
}
|
||||
|
||||
report.SortRecords()
|
||||
|
||||
return report
|
||||
}
|
||||
|
||||
func (r *Report) SortRecords() {
|
||||
// First sort records by count (in reverse order hopefully)
|
||||
// Then sort records by name if two elements have the same count.
|
||||
slices.SortFunc(r.records, func(a, b Record) int {
|
||||
if n := cmp.Compare(a.count, b.count); n != 0 {
|
||||
return -1 * n
|
||||
}
|
||||
|
||||
return strings.Compare(a.link, b.link)
|
||||
})
|
||||
}
|
||||
|
||||
func (r Report) String() string {
|
||||
var builder strings.Builder
|
||||
|
||||
titlebar := strings.Repeat("\u2500", 80)
|
||||
|
||||
builder.WriteString("\n" + titlebar)
|
||||
builder.WriteString("\n" + "REPORT for " + r.baseURL)
|
||||
builder.WriteString("\n" + titlebar)
|
||||
|
||||
for ind := range slices.All(r.records) {
|
||||
builder.WriteString("\nFound " + strconv.Itoa(r.records[ind].count) + " internal links to " + r.records[ind].link)
|
||||
}
|
||||
|
||||
builder.WriteString("\n")
|
||||
|
||||
return builder.String()
|
||||
}
|
Loading…
Reference in a new issue