feat: add the web crawler

Add the source code for the web crawler. The web crawler is a simple Go
CLI application that traverses a website and generates a report of all
the internal links found on the site.
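As wired up in main.go, the crawler takes three positional arguments: the base
URL to crawl, the maximum number of concurrent crawlers, and the maximum number
of pages to record (an illustrative invocation: crawler https://example.com 5 20).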
Author: Dan Anglin, 2024-08-27 15:42:26 +01:00
parent 5d447923b1
commit 4519de764e
Signed by: dananglin (GPG key ID: 0C1D44CFBEE68638)
25 changed files with 935 additions and 60 deletions

@@ -0,0 +1,6 @@
# syntax=docker/dockerfile:1
FROM golang:1.23.0
RUN go install github.com/magefile/mage@v1.15.0
ENTRYPOINT ["mage"]

@@ -0,0 +1,16 @@
---
name: "Mage Action"
description: "Runs a mage target defined in the project's repository"
inputs:
target:
description: "The mage target to run"
required: true
runs:
using: "docker"
image: "Dockerfile"
entrypoint: "mage"
args:
- -v
- ${{ inputs.target }}

@@ -0,0 +1,23 @@
---
name: Tests
on:
pull_request:
types:
- opened
- synchronize
jobs:
test:
if: ${{ ! github.event.pull_request.draft }}
runs-on: docker
steps:
- name: Checkout Repository
uses: https://code.forgejo.org/actions/checkout@v4
- name: Test
uses: ./.forgejo/actions/mage
with:
target: test
env:
CRAWLER_TEST_COVER: "1"
CRAWLER_TEST_VERBOSE: "1"
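(The workflow runs the test target through the mage action defined above. A
roughly equivalent local invocation, assuming mage v1.15.0 is installed as in
the action's Dockerfile, would be: CRAWLER_TEST_VERBOSE=1 CRAWLER_TEST_COVER=1 mage -v test.)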

@@ -1,37 +0,0 @@
---
on:
pull_request:
types:
- opened
- reopened
- synchronize
jobs:
test:
runs-on: docker
env:
GO_TEST_VERBOSE: "1"
GO_TEST_COVER: "1"
steps:
- name: Checkout Repository
uses: https://code.forgejo.org/actions/checkout@v4
- name: Setup Go
uses: https://code.forgejo.org/actions/setup-go@v5
with:
go-version: '1.22'
- name: Test
run: go run magefiles/main.go -v test
lint:
runs-on: docker
steps:
- name: Checkout Repository
uses: https://code.forgejo.org/actions/checkout@v4
- name: Setup Go
uses: https://code.forgejo.org/actions/setup-go@v5
with:
go-version: '1.22'
- name: Lint
uses: https://github.com/golangci/golangci-lint-action@v3
with:
version: v1.54

.gitignore (vendored, 3 changed lines)

@@ -1,2 +1 @@
-/__build/*
-!__build/.gitkeep
+/crawler

@@ -6,7 +6,7 @@ run:
   tests: true
 output:
-  format: colored-line-number
+  formats: colored-line-number
   print-issues-lines: true
   print-linter-name: true
   uniq-by-line: true
@@ -18,5 +18,7 @@ linters-settings:
 linters:
   enable-all: true
-  # disable:
+  disable:
+    - execinquery
+    - gomnd
   fast: false

LICENSE (new file, 21 lines)

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 Dan Anglin
Permission is hereby granted, free of charge, to any person obtaining a copy of this
software and associated documentation files (the “Software”), to deal in the Software
without restriction, including without limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons
to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or
substantial portions of the Software.
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

go.mod (new file, 5 lines)

@@ -0,0 +1,5 @@
module codeflow.dananglin.me.uk/apollo/web-crawler
go 1.23.0
require golang.org/x/net v0.28.0

go.sum (new file, 2 lines)

@@ -0,0 +1,2 @@
golang.org/x/net v0.28.0 h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE=
golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg=

internal/crawler/crawler.go (new file, 168 lines)

@@ -0,0 +1,168 @@
package crawler
import (
"fmt"
"net/url"
"os"
"sync"
"codeflow.dananglin.me.uk/apollo/web-crawler/internal/report"
"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
)
type Crawler struct {
pages map[string]int
baseURL *url.URL
mu *sync.Mutex
concurrencyControl chan struct{}
wg *sync.WaitGroup
maxPages int
}
func NewCrawler(rawBaseURL string, maxConcurrency, maxPages int) (*Crawler, error) {
baseURL, err := url.Parse(rawBaseURL)
if err != nil {
return nil, fmt.Errorf("unable to parse the base URL: %w", err)
}
var waitGroup sync.WaitGroup
waitGroup.Add(1)
crawler := Crawler{
pages: make(map[string]int),
baseURL: baseURL,
mu: &sync.Mutex{},
concurrencyControl: make(chan struct{}, maxConcurrency),
wg: &waitGroup,
maxPages: maxPages,
}
return &crawler, nil
}
func (c *Crawler) Crawl(rawCurrentURL string) {
// Add an empty struct to channel here
c.concurrencyControl <- struct{}{}
// Decrement the wait group counter and free up the channel when finished
// crawling.
defer func() {
<-c.concurrencyControl
c.wg.Done()
}()
if c.reachedMaxPages() {
return
}
// if current URL is not on the same domain as the base URL then return early.
hasEqualDomain, err := c.HasEqualDomain(rawCurrentURL)
if err != nil {
fmt.Printf(
"WARNING: Unable to determine if %q has the same domain as %q; %v.\n",
rawCurrentURL,
c.baseURL.Hostname(),
err,
)
return
}
if !hasEqualDomain {
return
}
// get normalised version of rawCurrentURL
normalisedCurrentURL, err := util.NormaliseURL(rawCurrentURL)
if err != nil {
fmt.Printf("WARNING: Error normalising %q: %v.\n", rawCurrentURL, err)
return
}
// Add (or update) a record of the URL in the pages map.
// If there's already an entry of the URL in the map then return early.
if existed := c.AddPageVisit(normalisedCurrentURL); existed {
return
}
// Get the HTML document from the current URL.
fmt.Printf("Crawling %q\n", rawCurrentURL)
htmlDoc, err := getHTML(rawCurrentURL)
if err != nil {
fmt.Printf(
"WARNING: Error retrieving the HTML document from %q: %v.\n",
rawCurrentURL,
err,
)
return
}
// Get all the URLs from the HTML doc.
links, err := util.GetURLsFromHTML(htmlDoc, c.baseURL.String())
if err != nil {
fmt.Printf(
"WARNING: Error retrieving the links from the HTML document: %v.\n",
err,
)
return
}
// Recursively crawl each URL on the page.
for ind := range len(links) {
c.wg.Add(1)
go c.Crawl(links[ind])
}
}
func (c *Crawler) HasEqualDomain(rawURL string) (bool, error) {
parsedRawURL, err := url.Parse(rawURL)
if err != nil {
return false, fmt.Errorf("error parsing the URL %q: %w", rawURL, err)
}
return c.baseURL.Hostname() == parsedRawURL.Hostname(), nil
}
// AddPageVisit adds a record of the visited page's URL to the pages map.
// If there is already a record of the URL then its count is incremented
// and the method returns true. If the URL is not already recorded then a new
// record is created and the method returns false.
func (c *Crawler) AddPageVisit(normalisedURL string) bool {
c.mu.Lock()
defer c.mu.Unlock()
_, exists := c.pages[normalisedURL]
if exists {
c.pages[normalisedURL]++
} else {
c.pages[normalisedURL] = 1
}
return exists
}
func (c *Crawler) Wait() {
c.wg.Wait()
}
func (c *Crawler) PrintReport() {
c.mu.Lock()
defer c.mu.Unlock()
r := report.NewReport(c.baseURL.String(), c.pages)
fmt.Fprint(os.Stdout, r)
}
func (c *Crawler) reachedMaxPages() bool {
c.mu.Lock()
defer c.mu.Unlock()
return len(c.pages) >= c.maxPages
}
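Aside (not part of this commit): Crawl limits parallelism with a buffered
channel used as a semaphore alongside a sync.WaitGroup. A minimal,
self-contained sketch of that pattern, with placeholder names and limits:

package main

import (
	"fmt"
	"sync"
)

func main() {
	// Placeholder limit; the Crawler receives this as maxConcurrency.
	const maxConcurrency = 3

	semaphore := make(chan struct{}, maxConcurrency)

	var waitGroup sync.WaitGroup

	for task := range 10 {
		waitGroup.Add(1)

		go func() {
			// Block until a slot is free.
			semaphore <- struct{}{}

			defer func() {
				// Free the slot and mark this task as done.
				<-semaphore
				waitGroup.Done()
			}()

			fmt.Println("processing task", task)
		}()
	}

	waitGroup.Wait()
}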

@@ -0,0 +1,172 @@
package crawler_test
import (
"fmt"
"slices"
"testing"
"codeflow.dananglin.me.uk/apollo/web-crawler/internal/crawler"
"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
)
func TestCrawler(t *testing.T) {
testBaseURL := "https://example.com"
testCrawler, err := crawler.NewCrawler(testBaseURL, 1, 10)
if err != nil {
t.Fatalf("Test 'TestCrawler' FAILED: unexpected error creating the crawler: %v", err)
}
testCasesForEqualDomains := []struct {
name string
rawURL string
want bool
}{
{
name: "Same domain",
rawURL: "https://example.com",
want: true,
},
{
name: "Same domain, different path",
rawURL: "https://example.com/about/contact",
want: true,
},
{
name: "Same domain, different protocol",
rawURL: "http://example.com",
want: true,
},
{
name: "Different domain",
rawURL: "https://blog.person.me.uk",
want: false,
},
{
name: "Different domain, same path",
rawURL: "https://example.org/blog",
want: false,
},
}
for ind, tc := range slices.All(testCasesForEqualDomains) {
t.Run(tc.name, testHasEqualDomains(
testCrawler,
ind+1,
tc.name,
tc.rawURL,
tc.want,
))
}
testCasesForPages := []struct {
rawURL string
wantVisited bool
}{
{
rawURL: "https://example.com/tags/linux",
wantVisited: false,
},
{
rawURL: "https://example.com/blog",
wantVisited: false,
},
{
rawURL: "https://example.com/about/contact.html",
wantVisited: false,
},
{
rawURL: "https://example.com/blog",
wantVisited: true,
},
}
for ind, tc := range slices.All(testCasesForPages) {
name := fmt.Sprintf("Adding %s to the pages map", tc.rawURL)
t.Run(name, testAddPageVisit(
testCrawler,
ind+1,
name,
tc.rawURL,
tc.wantVisited,
))
}
}
func testHasEqualDomains(
testCrawler *crawler.Crawler,
testNum int,
testName string,
rawURL string,
want bool,
) func(t *testing.T) {
return func(t *testing.T) {
t.Parallel()
got, err := testCrawler.HasEqualDomain(rawURL)
if err != nil {
t.Fatalf(
"Test %d - '%s' FAILED: unexpected error: %v",
testNum,
testName,
err,
)
}
if got != want {
t.Errorf(
"Test %d - '%s' FAILED: unexpected domain comparison received: want %t, got %t",
testNum,
testName,
want,
got,
)
} else {
t.Logf(
"Test %d - '%s' PASSED: expected domain comparison received: got %t",
testNum,
testName,
got,
)
}
}
}
func testAddPageVisit(
testCrawler *crawler.Crawler,
testNum int,
testName string,
rawURL string,
wantVisited bool,
) func(t *testing.T) {
return func(t *testing.T) {
normalisedURL, err := util.NormaliseURL(rawURL)
if err != nil {
t.Fatalf(
"Test %d - '%s' FAILED: unexpected error: %v",
testNum,
testName,
err,
)
}
gotVisited := testCrawler.AddPageVisit(normalisedURL)
if gotVisited != wantVisited {
t.Errorf(
"Test %d - '%s' FAILED: unexpected bool returned after updated pages record: want %t, got %t",
testNum,
testName,
wantVisited,
gotVisited,
)
} else {
t.Logf(
"Test %d - '%s' PASSED: expected bool returned after updated pages record: got %t",
testNum,
testName,
gotVisited,
)
}
}
}

@@ -0,0 +1,50 @@
package crawler
import (
"context"
"fmt"
"io"
"net/http"
"strings"
"time"
)
func getHTML(rawURL string) (string, error) {
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(10*time.Second))
defer cancel()
request, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
if err != nil {
return "", fmt.Errorf("error creating the HTTP request: %w", err)
}
client := http.Client{}
resp, err := client.Do(request)
if err != nil {
return "", fmt.Errorf("error getting the response: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode >= 400 {
return "", fmt.Errorf(
"received a bad status from %s: (%d) %s",
rawURL,
resp.StatusCode,
resp.Status,
)
}
contentType := resp.Header.Get("content-type")
if !strings.Contains(contentType, "text/html") {
return "", fmt.Errorf("unexpected content type received: want text/html, got %s", contentType)
}
data, err := io.ReadAll(resp.Body)
if err != nil {
return "", fmt.Errorf("error reading the data from the response: %w", err)
}
return string(data), nil
}

internal/report/report.go (new file, 66 lines)

@@ -0,0 +1,66 @@
package report
import (
"cmp"
"maps"
"slices"
"strconv"
"strings"
)
type Report struct {
baseURL string
records []Record
}
type Record struct {
link string
count int
}
func NewReport(baseURL string, pages map[string]int) Report {
records := make([]Record, 0)
for link, count := range maps.All(pages) {
records = append(records, Record{link: link, count: count})
}
report := Report{
baseURL: baseURL,
records: records,
}
report.SortRecords()
return report
}
func (r *Report) SortRecords() {
// First sort records by count in descending order.
// Then sort records by name if two elements have the same count.
slices.SortFunc(r.records, func(a, b Record) int {
if n := cmp.Compare(a.count, b.count); n != 0 {
return -1 * n
}
return strings.Compare(a.link, b.link)
})
}
func (r Report) String() string {
var builder strings.Builder
titlebar := strings.Repeat("\u2500", 80)
builder.WriteString("\n" + titlebar)
builder.WriteString("\n" + "REPORT for " + r.baseURL)
builder.WriteString("\n" + titlebar)
for ind := range slices.All(r.records) {
builder.WriteString("\nFound " + strconv.Itoa(r.records[ind].count) + " internal links to " + r.records[ind].link)
}
builder.WriteString("\n")
return builder.String()
}
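For reference, the String method renders the report roughly as follows (the
URLs and counts are made up for illustration; the real title bar is 80
box-drawing characters wide):

────────────────────────────────────────
REPORT for https://example.com
────────────────────────────────────────
Found 3 internal links to example.com/blog
Found 1 internal links to example.com/about/contact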

internal/util/html.go (new file, 70 lines)

@@ -0,0 +1,70 @@
package util
import (
"fmt"
"net/url"
"strings"
"golang.org/x/net/html"
)
func GetURLsFromHTML(htmlBody, rawBaseURL string) ([]string, error) {
htmlDoc, err := html.Parse(strings.NewReader(htmlBody))
if err != nil {
return []string{}, fmt.Errorf("unable to parse the HTML document: %w", err)
}
parsedRawBaseURL, err := url.Parse(rawBaseURL)
if err != nil {
return []string{}, fmt.Errorf("unable to parse the raw base URL %q: %w", rawBaseURL, err)
}
output := make([]string, 0, 3)
var extractLinkFunc func(*html.Node) error
extractLinkFunc = func(node *html.Node) error {
if node.Type == html.ElementNode && node.Data == "a" {
for _, a := range node.Attr {
if a.Key == "href" {
extractedURL, err := getAbsoluteURL(a.Val, parsedRawBaseURL)
if err != nil {
return fmt.Errorf("unable to get the absolute URL of %s: %w", a.Val, err)
}
output = append(output, extractedURL)
break
}
}
}
for c := node.FirstChild; c != nil; c = c.NextSibling {
if err := extractLinkFunc(c); err != nil {
return err
}
}
return nil
}
if err := extractLinkFunc(htmlDoc); err != nil {
return []string{}, err
}
return output, nil
}
func getAbsoluteURL(inputURL string, baseURL *url.URL) (string, error) {
parsedURL, err := url.Parse(inputURL)
if err != nil {
return "", fmt.Errorf("unable to parse the URL from %s: %w", inputURL, err)
}
if parsedURL.Scheme == "" && parsedURL.Host == "" {
parsedURL.Scheme = baseURL.Scheme
parsedURL.Host = baseURL.Host
}
return parsedURL.String(), nil
}

@@ -0,0 +1,99 @@
package util_test
import (
"os"
"reflect"
"slices"
"testing"
"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
)
func TestGetURLsFromHTML(t *testing.T) {
t.Parallel()
cases := []struct {
name string
filepath string
baseURL string
want []string
}{
{
name: "HTML document using blog.boot.dev",
filepath: "testdata/GetURLFromHTML/blog.boot.dev.html",
baseURL: "https://blog.boot.dev",
want: []string{
"https://blog.boot.dev/path/one",
"https://other.com/path/one",
},
},
{
name: "HTML document using https://ben-bartlett.me.uk",
filepath: "testdata/GetURLFromHTML/ben-bartlett.html",
baseURL: "https://ben-bartlett.me.uk",
want: []string{
"https://ben-bartlett.me.uk",
"https://github.com/ben-bartlett",
"https://mastodon.ben-bartlett.me.uk",
"https://ben-bartlett.me.uk/blog",
"https://ben-bartlett.me.uk/projects/orange-juice",
"https://ben-bartlett.me.uk/projects/mustangs",
"https://ben-bartlett.me.uk/projects/honeycombs",
},
},
{
name: "HTML document using https://simple.cooking",
filepath: "testdata/GetURLFromHTML/my-simple-cooking-website.html",
baseURL: "https://simple.cooking",
want: []string{
"https://simple.cooking/recipes/sweet-n-sour-kung-pao-style-chicken",
"https://simple.cooking/recipes/beef-and-broccoli",
"https://simple.cooking/recipes/asian-glazed-salmon",
"https://simple.cooking/recipes/caesar-salad",
"https://simple.cooking/recipes/simple-tuna-salad",
"https://simple.cooking/recipes/wholemeal-pizza",
"https://simple.cooking/news",
"https://simple.cooking/about/contact",
"https://the-other-site.example.new/home",
},
},
}
for _, tc := range slices.All(cases) {
t.Run(tc.name, testGetURLsFromHTML(tc.filepath, tc.baseURL, tc.want))
}
}
func testGetURLsFromHTML(path, baseURL string, want []string) func(t *testing.T) {
failedTestPrefix := "Test TestGetURLsFromHTML FAILED:"
return func(t *testing.T) {
t.Parallel()
htmlDoc, err := os.ReadFile(path)
if err != nil {
t.Fatalf("%s unable to open read data from %s: %v", failedTestPrefix, path, err)
}
got, err := util.GetURLsFromHTML(string(htmlDoc), baseURL)
if err != nil {
t.Fatalf(
"Test TestGetURLsFromHTML FAILED: unexpected error: %v",
err,
)
}
if !reflect.DeepEqual(want, got) {
t.Errorf(
"Test TestGetURLsFromHTML FAILED: unexpected URLs found in HTML body: want %v, got %v",
want,
got,
)
} else {
t.Logf(
"Test TestGetURLsFromHTML PASSED: expected URLs found in HTML body: got %v",
got,
)
}
}
}

@@ -0,0 +1,34 @@
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
<head>
<meta charset="utf-8" />
<title>Ben Bartlett</title>
<style>
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
span.underline{text-decoration: underline;}
div.column{display: inline-block; vertical-align: top; width: 50%;}
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
ul.task-list{list-style: none;}
</style>
</head>
<body>
<header id="title-block-header">
<h1 class="title">Ben Bartlett</h1>
</header>
<p>Hey there! Ben Bartlett here. I am a Backend software engineer working in the healthcare industry. At night I am a hobbyist developer of 2D games. When I'm not coding I would find myself cooking, reading engaging novels, and going on the occasional hike or two.</p>
<h2 id="my-links">My Links</h2>
<ul>
<li><a href="https://ben-bartlett.me.uk">My website</a></li>
<li><a href="https://github.com/ben-bartlett">GitHub</a></li>
<li><a href="https://mastodon.ben-bartlett.me.uk">Mastodon</a></li>
<li><a href="/blog">My blog</a></li>
</ul>
<h2 id="projects-im-working-on">Projects I'm working on</h2>
<ul>
<li><a href="/projects/orange-juice">Orange Juice</a></li>
<li><a href="/projects/mustangs">Mustangs</a></li>
<li><a href="/projects/honeycombs">Honeycombs</a></li>
</ul>
</body>
</html>

@@ -0,0 +1,10 @@
<html>
<body>
<a href="/path/one">
<span>Boot.dev</span>
</a>
<a href="https://other.com/path/one">
<span>Boot.dev</span>
</a>
</body>
</html>

@@ -0,0 +1,37 @@
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes" />
<title>My simple cooking website</title>
<style>
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
span.underline{text-decoration: underline;}
div.column{display: inline-block; vertical-align: top; width: 50%;}
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
ul.task-list{list-style: none;}
</style>
</head>
<body>
<header id="title-block-header">
<h1 class="title">My simple cooking website</h1>
</header>
<p>Find my favourite recipes here.</p>
<h2 id="recipes">Recipes</h2>
<ul>
<li><a href="/recipes/sweet-n-sour-kung-pao-style-chicken">Sweet 'n' Sour Kung Pao-Style Chicken</a></li>
<li><a href="/recipes/beef-and-broccoli">Beef and Broccoli</a></li>
<li><a href="/recipes/asian-glazed-salmon">Asian Glazed Salmon</a></li>
<li><a href="/recipes/caesar-salad">Caesar Salad</a></li>
<li><a href="/recipes/simple-tuna-salad">Simple Tuna Salad</a></li>
<li><a href="/recipes/wholemeal-pizza">Wholemeal Pizza</a></li>
</ul>
<h2 id="links">Links</h2>
<ul>
<li><a href="/news">News</a></li>
<li><a href="/about/contact">Contact</a></li>
<li><a href="https://the-other-site.example.new/home">The other site</a></li>
</ul>
</body>
</html>

internal/util/url.go (new file, 18 lines)

@@ -0,0 +1,18 @@
package util
import (
"fmt"
"net/url"
"strings"
)
func NormaliseURL(rawURL string) (string, error) {
const normalisedFormat string = "%s%s"
parsedURL, err := url.Parse(rawURL)
if err != nil {
return "", fmt.Errorf("error parsing the URL %q: %w", rawURL, err)
}
return fmt.Sprintf(normalisedFormat, parsedURL.Hostname(), strings.TrimSuffix(parsedURL.Path, "/")), nil
}

internal/util/url_test.go (new file, 81 lines)

@@ -0,0 +1,81 @@
package util_test
import (
"slices"
"testing"
"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
)
func TestNormaliseURL(t *testing.T) {
t.Parallel()
wantNormalisedURL := "blog.boot.dev/path"
cases := []struct {
name string
inputURL string
}{
{
name: "remove HTTPS scheme",
inputURL: "https://blog.boot.dev/path",
},
{
name: "remove HTTP scheme",
inputURL: "http://blog.boot.dev/path",
},
{
name: "remove HTTPS scheme with a trailing slash",
inputURL: "https://blog.boot.dev/path/",
},
{
name: "remove HTTP scheme with a trailing slash",
inputURL: "http://blog.boot.dev/path/",
},
{
name: "remove HTTPS scheme with port 443",
inputURL: "https://blog.boot.dev:443/path",
},
{
name: "remove HTTP scheme with port 80",
inputURL: "http://blog.boot.dev:80/path",
},
{
name: "normalised URL",
inputURL: "blog.boot.dev/path",
},
}
for ind, tc := range slices.All(cases) {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
got, err := util.NormaliseURL(tc.inputURL)
if err != nil {
t.Fatalf(
"Test %d - '%s' FAILED: unexpected error: %v",
ind,
tc.name,
err,
)
}
if got != wantNormalisedURL {
t.Errorf(
"Test %d - %s FAILED: unexpected normalised URL returned: want %s, got %s",
ind,
tc.name,
wantNormalisedURL,
got,
)
} else {
t.Logf(
"Test %d - %s PASSED: expected normalised URL returned: got %s",
ind,
tc.name,
got,
)
}
})
}
}

magefiles/go.mod (new file, 5 lines)

@@ -0,0 +1,5 @@
module codeflow.dananglin.me.uk/apollo/web-crawler/magefiles
go 1.23.0
require github.com/magefile/mage v1.15.0

magefiles/go.sum (new file, 2 lines)

@@ -0,0 +1,2 @@
github.com/magefile/mage v1.15.0 h1:BvGheCMAsG3bWUDbZ8AyXXpCNwU9u5CB6sM+HNb9HYg=
github.com/magefile/mage v1.15.0/go.mod h1:z5UZb/iS3GoOSn0JgWuiw7dxlurVYTu+/jHXqQg881A=


@@ -14,23 +14,23 @@ import (
 )

 const (
-	app                  = "binary"
+	app                  = "crawler"
 	defaultInstallPrefix = "/usr/local"
-	envInstallPrefix     = "PROJECT_INSTALL_PREFIX"
-	envTestVerbose       = "PROJECT_TEST_VERBOSE"
-	envTestCover         = "PROJECT_TEST_COVER"
-	envBuildRebuildAll   = "PROJECT_BUILD_REBUILD_ALL"
-	envBuildVerbose      = "PROJECT_BUILD_VERBOSE"
+	envInstallPrefix     = "CRAWLER_INSTALL_PREFIX"
+	envTestVerbose       = "CRAWLER_TEST_VERBOSE"
+	envTestCover         = "CRAWLER_TEST_COVER"
+	envBuildRebuildAll   = "CRAWLER_BUILD_REBUILD_ALL"
+	envBuildVerbose      = "CRAWLER_BUILD_VERBOSE"
 )

 var (
 	Default = Build
-	binary  = "./__build/" + app
+	binary  = app
 )

 // Test run the go tests.
-// To enable verbose mode set PROJECT_TEST_VERBOSE=1.
-// To enable coverage mode set PROJECT_TEST_COVER=1.
+// To enable verbose mode set CRAWLER_TEST_VERBOSE=1.
+// To enable coverage mode set CRAWLER_TEST_COVER=1.
 func Test() error {
 	goTest := sh.RunCmd("go", "test")
@@ -56,10 +56,10 @@ func Lint() error {
 // To rebuild packages that are already up-to-date set PROJECT_BUILD_REBUILD_ALL=1
 // To enable verbose mode set PROJECT_BUILD_VERBOSE=1
 func Build() error {
-	main := "main.go"
-	flags := ldflags()
+	main := "."
+	//flags := ldflags()
 	build := sh.RunCmd("go", "build")
-	args := []string{"-ldflags=" + flags, "-o", binary}
+	args := []string{"-ldflags=-s -w", "-o", binary}

 	if os.Getenv(envBuildRebuildAll) == "1" {
 		args = append(args, "-a")

main.go (40 changed lines)

@@ -3,22 +3,48 @@ package main
 import (
 	"fmt"
 	"os"
-)
-
-var (
-	binaryVersion string
-	buildTime     string
-	goVersion     string
-	gitCommit     string
+	"strconv"
+
+	"codeflow.dananglin.me.uk/apollo/web-crawler/internal/crawler"
 )

 func main() {
 	if err := run(); err != nil {
-		fmt.Printf("ERROR: %v.\n", err)
+		os.Stderr.WriteString("ERROR: " + err.Error() + "\n")
 		os.Exit(1)
 	}
 }

 func run() error {
+	args := os.Args[1:]
+
+	if len(args) != 3 {
+		return fmt.Errorf("unexpected number of arguments received: want 3, got %d", len(args))
+	}
+
+	baseURL := args[0]
+
+	maxConcurrency, err := strconv.Atoi(args[1])
+	if err != nil {
+		return fmt.Errorf("unable to convert the max concurrency (%s) to an integer: %w", args[1], err)
+	}
+
+	maxPages, err := strconv.Atoi(args[2])
+	if err != nil {
+		return fmt.Errorf("unable to convert the max pages (%s) to an integer: %w", args[2], err)
+	}
+
+	c, err := crawler.NewCrawler(baseURL, maxConcurrency, maxPages)
+	if err != nil {
+		return fmt.Errorf("unable to create the crawler: %w", err)
+	}
+
+	go c.Crawl(baseURL)
+
+	c.Wait()
+
+	c.PrintReport()
+
 	return nil
 }