From 5498ac7b4e46f8f02dc21548fd61480ef59db829 Mon Sep 17 00:00:00 2001
From: Dan Anglin
Date: Wed, 28 Aug 2024 07:39:24 +0100
Subject: [PATCH] feat: add external links to the report

---
 README.md                        | 14 ++++--
 internal/crawler/crawler.go      | 63 ++++++++++++++----------
 internal/crawler/crawler_test.go | 21 ++++----
 internal/crawler/report.go       | 83 ++++++++++++++++++++++++++++++++
 internal/report/report.go        | 66 -------------------------
 5 files changed, 140 insertions(+), 107 deletions(-)
 create mode 100644 internal/crawler/report.go
 delete mode 100644 internal/report/report.go

diff --git a/README.md b/README.md
index 4a6813c..d603396 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 ## Overview
 
-This web crawler crawls a given URL and generates a report for all the internal links it finds.
+This web crawler crawls a given website and generates a report for all the internal and external links found during the crawl.
 
 ### Repository mirrors
 
@@ -21,9 +21,15 @@ git clone https://github.com/dananglin/web-crawler.git
 ```
 
 Build the application.
-```
-go build -o crawler .
-```
+
+- Build with go
+  ```
+  go build -o crawler .
+  ```
+- Or build with [mage](https://magefile.org/) if you have it installed.
+  ```
+  mage build
+  ```
 
 Run the application specifying the website that you want to crawl.
 
diff --git a/internal/crawler/crawler.go b/internal/crawler/crawler.go
index c4a62ec..74dddba 100644
--- a/internal/crawler/crawler.go
+++ b/internal/crawler/crawler.go
@@ -6,12 +6,11 @@ import (
 	"os"
 	"sync"
 
-	"codeflow.dananglin.me.uk/apollo/web-crawler/internal/report"
 	"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
 )
 
 type Crawler struct {
-	pages      map[string]int
+	pages      map[string]pageStat
 	baseURL    *url.URL
 	mu         *sync.Mutex
 	workerPool chan struct{}
@@ -19,6 +18,11 @@ type Crawler struct {
 	maxPages   int
 }
 
+type pageStat struct {
+	count    int
+	internal bool
+}
+
 func NewCrawler(rawBaseURL string, maxWorkers, maxPages int) (*Crawler, error) {
 	baseURL, err := url.Parse(rawBaseURL)
 	if err != nil {
@@ -30,7 +34,7 @@ func NewCrawler(rawBaseURL string, maxWorkers, maxPages int) (*Crawler, error) {
 	waitGroup.Add(1)
 
 	crawler := Crawler{
-		pages:      make(map[string]int),
+		pages:      make(map[string]pageStat),
 		baseURL:    baseURL,
 		mu:         &sync.Mutex{},
 		workerPool: make(chan struct{}, maxWorkers),
@@ -56,23 +60,6 @@ func (c *Crawler) Crawl(rawCurrentURL string) {
 		return
 	}
 
-	// if current URL is not on the same domain as the base URL then return early.
-	hasEqualDomain, err := c.HasEqualDomain(rawCurrentURL)
-	if err != nil {
-		fmt.Printf(
-			"WARNING: Unable to determine if %q has the same domain as %q; %v.\n",
-			rawCurrentURL,
-			c.baseURL.Hostname(),
-			err,
-		)
-
-		return
-	}
-
-	if !hasEqualDomain {
-		return
-	}
-
 	// get normalised version of rawCurrentURL
 	normalisedCurrentURL, err := util.NormaliseURL(rawCurrentURL)
 	if err != nil {
@@ -81,9 +68,25 @@ func (c *Crawler) Crawl(rawCurrentURL string) {
 		return
 	}
 
+	isInternalLink, err := c.isInternalLink(rawCurrentURL)
+	if err != nil {
+		fmt.Printf(
+			"WARNING: Unable to determine if %q is an internal link; %v.\n",
+			rawCurrentURL,
+			err,
+		)
+
+		return
+	}
+
 	// Add (or update) a record of the URL in the pages map.
 	// If there's already an entry of the URL in the map then return early.
-	if existed := c.AddPageVisit(normalisedCurrentURL); existed {
+	if existed := c.addPageVisit(normalisedCurrentURL, isInternalLink); existed {
+		return
+	}
+
+	// if current URL is an external link then return early.
+	if !isInternalLink {
 		return
 	}
 
@@ -119,7 +122,10 @@ func (c *Crawler) Crawl(rawCurrentURL string) {
 	}
 }
 
-func (c *Crawler) HasEqualDomain(rawURL string) (bool, error) {
+// isInternalLink evaluates whether the input URL is an internal link to the
+// base URL. An internal link is determined by comparing the host names of both
+// the input and base URLs.
+func (c *Crawler) isInternalLink(rawURL string) (bool, error) {
 	parsedRawURL, err := url.Parse(rawURL)
 	if err != nil {
 		return false, fmt.Errorf("error parsing the URL %q: %w", rawURL, err)
@@ -132,16 +138,21 @@ func (c *Crawler) HasEqualDomain(rawURL string) (bool, error) {
 // If there is already a record of the URL then its record is updated (incremented)
 // and the method returns true. If the URL is not already recorded then it is created
 // and the method returns false.
-func (c *Crawler) AddPageVisit(normalisedURL string) bool {
+func (c *Crawler) addPageVisit(normalisedURL string, internal bool) bool {
 	c.mu.Lock()
 	defer c.mu.Unlock()
 
 	_, exists := c.pages[normalisedURL]
 
 	if exists {
-		c.pages[normalisedURL]++
+		stat := c.pages[normalisedURL]
+		stat.count++
+		c.pages[normalisedURL] = stat
 	} else {
-		c.pages[normalisedURL] = 1
+		c.pages[normalisedURL] = pageStat{
+			count:    1,
+			internal: internal,
+		}
 	}
 
 	return exists
@@ -155,7 +166,7 @@ func (c *Crawler) PrintReport() {
 	c.mu.Lock()
 	defer c.mu.Unlock()
 
-	r := report.NewReport(c.baseURL.String(), c.pages)
+	r := newReport(c.baseURL.String(), c.pages)
 
 	fmt.Fprint(os.Stdout, r)
 }
diff --git a/internal/crawler/crawler_test.go b/internal/crawler/crawler_test.go
index 4ccfac8..48c17ff 100644
--- a/internal/crawler/crawler_test.go
+++ b/internal/crawler/crawler_test.go
@@ -1,18 +1,17 @@
-package crawler_test
+package crawler
 
 import (
 	"fmt"
 	"slices"
 	"testing"
 
-	"codeflow.dananglin.me.uk/apollo/web-crawler/internal/crawler"
 	"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
 )
 
 func TestCrawler(t *testing.T) {
 	testBaseURL := "https://example.com"
 
-	testCrawler, err := crawler.NewCrawler(testBaseURL, 1, 10)
+	testCrawler, err := NewCrawler(testBaseURL, 1, 10)
 	if err != nil {
 		t.Fatalf("Test 'TestCrawler' FAILED: unexpected error creating the crawler: %v", err)
 	}
@@ -50,7 +49,7 @@ func TestCrawler(t *testing.T) {
 	}
 
 	for ind, tc := range slices.All(testCasesForEqualDomains) {
-		t.Run(tc.name, testHasEqualDomains(
+		t.Run(tc.name, testIsInternalLink(
 			testCrawler,
 			ind+1,
 			tc.name,
@@ -83,7 +82,7 @@ func TestCrawler(t *testing.T) {
 	for ind, tc := range slices.All(testCasesForPages) {
 		name := fmt.Sprintf("Adding %s to the pages map", tc.rawURL)
 
-		t.Run(name, testAddPageVisit(
+		t.Run(name, testHasVisited(
 			testCrawler,
 			ind+1,
 			name,
@@ -93,8 +92,8 @@ func TestCrawler(t *testing.T) {
 	}
 }
 
-func testHasEqualDomains(
-	testCrawler *crawler.Crawler,
+func testIsInternalLink(
+	testCrawler *Crawler,
 	testNum int,
 	testName string,
 	rawURL string,
@@ -103,7 +102,7 @@ func TestCrawler(t *testing.T) {
 	return func(t *testing.T) {
 		t.Parallel()
 
-		got, err := testCrawler.HasEqualDomain(rawURL)
+		got, err := testCrawler.isInternalLink(rawURL)
 		if err != nil {
 			t.Fatalf(
 				"Test %d - '%s' FAILED: unexpected error: %v",
@@ -132,8 +131,8 @@ func TestCrawler(t *testing.T) {
 	}
 }
 
-func testAddPageVisit(
-	testCrawler *crawler.Crawler,
+func testHasVisited(
+	testCrawler *Crawler,
 	testNum int,
 	testName string,
 	rawURL string,
@@ -150,7 +149,7 @@ func testAddPageVisit(
 			)
 		}
 
-		gotVisited := testCrawler.AddPageVisit(normalisedURL)
+		gotVisited := testCrawler.addPageVisit(normalisedURL, true)
 
 		if gotVisited != wantVisited {
 			t.Errorf(
diff --git a/internal/crawler/report.go b/internal/crawler/report.go
new file mode 100644
index 0000000..3847512
--- /dev/null
+++ b/internal/crawler/report.go
@@ -0,0 +1,83 @@
+package crawler
+
+import (
+	"cmp"
+	"maps"
+	"slices"
+	"strconv"
+	"strings"
+)
+
+type report struct {
+	baseURL string
+	records []record
+}
+
+type record struct {
+	link     string
+	count    int
+	internal bool
+}
+
+func newReport(baseURL string, pages map[string]pageStat) report {
+	records := make([]record, 0)
+
+	for link, stats := range maps.All(pages) {
+		record := record{
+			link:     link,
+			count:    stats.count,
+			internal: stats.internal,
+		}
+
+		records = append(records, record)
+	}
+
+	report := report{
+		baseURL: baseURL,
+		records: records,
+	}
+
+	report.sortRecords()
+
+	return report
+}
+
+func (r *report) sortRecords() {
+	// First sort records by count in descending order,
+	// then sort records by name if two elements have the same count.
+	slices.SortFunc(r.records, func(a, b record) int {
+		if n := cmp.Compare(a.count, b.count); n != 0 {
+			return -1 * n
+		}
+
+		return strings.Compare(a.link, b.link)
+	})
+}
+
+func (r report) String() string {
+	var builder strings.Builder
+
+	titlebar := strings.Repeat("\u2500", 80)
+
+	builder.WriteString("\n" + titlebar)
+	builder.WriteString("\n" + "REPORT for " + r.baseURL)
+	builder.WriteString("\n" + titlebar)
+
+	for ind := range slices.All(r.records) {
+		linkType := "internal"
+		if !r.records[ind].internal {
+			linkType = "external"
+		}
+
+		links := "links"
+		if r.records[ind].count == 1 {
+			links = "link"
+		}
+
+		builder.WriteString("\nFound " + strconv.Itoa(r.records[ind].count) + " " + linkType + " " + links + " to " + r.records[ind].link)
+	}
+
+	builder.WriteString("\n")
+
+	return builder.String()
+}
diff --git a/internal/report/report.go b/internal/report/report.go
deleted file mode 100644
index b28a393..0000000
--- a/internal/report/report.go
+++ /dev/null
@@ -1,66 +0,0 @@
-package report
-
-import (
-	"cmp"
-	"maps"
-	"slices"
-	"strconv"
-	"strings"
-)
-
-type Report struct {
-	baseURL string
-	records []Record
-}
-
-type Record struct {
-	link  string
-	count int
-}
-
-func NewReport(baseURL string, pages map[string]int) Report {
-	records := make([]Record, 0)
-
-	for link, count := range maps.All(pages) {
-		records = append(records, Record{link: link, count: count})
-	}
-
-	report := Report{
-		baseURL: baseURL,
-		records: records,
-	}
-
-	report.SortRecords()
-
-	return report
-}
-
-func (r *Report) SortRecords() {
-	// First sort records by count (in reverse order hopefully)
-	// Then sort records by name if two elements have the same count.
-	slices.SortFunc(r.records, func(a, b Record) int {
-		if n := cmp.Compare(a.count, b.count); n != 0 {
-			return -1 * n
-		}
-
-		return strings.Compare(a.link, b.link)
-	})
-}
-
-func (r Report) String() string {
-	var builder strings.Builder
-
-	titlebar := strings.Repeat("\u2500", 80)
-
-	builder.WriteString("\n" + titlebar)
-	builder.WriteString("\n" + "REPORT for " + r.baseURL)
-	builder.WriteString("\n" + titlebar)
-
-	for ind := range slices.All(r.records) {
-		builder.WriteString("\nFound " + strconv.Itoa(r.records[ind].count) + " internal links to " + r.records[ind].link)
-	}
-
-	builder.WriteString("\n")
-
-	return builder.String()
-}
-- 
2.45.2
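
For reviewers: the sketch below is not part of the patch. It is a minimal, hypothetical example (the URLs and counts are made up) showing how the pieces introduced here fit together: the pageStat values recorded during a crawl feed newReport, and the resulting report renders itself through its String method. It is written inside package crawler because newReport, pageStat, and report are unexported.

package crawler

import "fmt"

// sketchReport builds a report from a hypothetical pages map and prints it.
// The map keys are normalised URLs; the values are the pageStat entries that
// addPageVisit records while crawling.
func sketchReport() {
	pages := map[string]pageStat{
		"example.com/about": {count: 3, internal: true},
		"magefile.org":      {count: 1, internal: false},
	}

	// newReport copies the map into records and sorts them by count
	// (descending), then by link name.
	r := newReport("https://example.com", pages)

	// report implements fmt.Stringer, so printing it renders the title bar,
	// the "REPORT for <baseURL>" heading, and one
	// "Found N internal/external link(s) to <link>" line per record.
	fmt.Print(r)
}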