feat: add the web crawler

Add the source code for the web crawler. The web crawler is a simple Go
CLI application that traverses a website and generates a report of all
the internal links found on the site.
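As wired up in main.go, the crawler takes three positional arguments: the base
URL to crawl, the maximum number of concurrent crawlers, and the maximum number
of pages to record (an illustrative invocation: crawler https://example.com 5 20).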
Author: Dan Anglin, 2024-08-27 15:42:26 +01:00
parent 5d447923b1
commit 4519de764e
Signed by: dananglin (GPG key ID: 0C1D44CFBEE68638)
25 changed files with 935 additions and 60 deletions

@@ -0,0 +1,6 @@
# syntax=docker/dockerfile:1
FROM golang:1.23.0
RUN go install github.com/magefile/mage@v1.15.0
ENTRYPOINT ["mage"]

@@ -0,0 +1,16 @@
---
name: "Mage Action"
description: "Runs a mage target defined in the project's repository"
inputs:
target:
description: "The mage target to run"
required: true
runs:
using: "docker"
image: "Dockerfile"
entrypoint: "mage"
args:
- -v
- ${{ inputs.target }}

@@ -0,0 +1,23 @@
---
name: Tests
on:
pull_request:
types:
- opened
- synchronize
jobs:
test:
if: ${{ ! github.event.pull_request.draft }}
runs-on: docker
steps:
- name: Checkout Repository
uses: https://code.forgejo.org/actions/checkout@v4
- name: Test
uses: ./.forgejo/actions/mage
with:
target: test
env:
CRAWLER_TEST_COVER: "1"
CRAWLER_TEST_VERBOSE: "1"
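(The workflow runs the test target through the mage action defined above. A
roughly equivalent local invocation, assuming mage v1.15.0 is installed as in
the action's Dockerfile, would be: CRAWLER_TEST_VERBOSE=1 CRAWLER_TEST_COVER=1 mage -v test.)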

@@ -1,37 +0,0 @@
---
on:
pull_request:
types:
- opened
- reopened
- synchronize
jobs:
test:
runs-on: docker
env:
GO_TEST_VERBOSE: "1"
GO_TEST_COVER: "1"
steps:
- name: Checkout Repository
uses: https://code.forgejo.org/actions/checkout@v4
- name: Setup Go
uses: https://code.forgejo.org/actions/setup-go@v5
with:
go-version: '1.22'
- name: Test
run: go run magefiles/main.go -v test
lint:
runs-on: docker
steps:
- name: Checkout Repository
uses: https://code.forgejo.org/actions/checkout@v4
- name: Setup Go
uses: https://code.forgejo.org/actions/setup-go@v5
with:
go-version: '1.22'
- name: Lint
uses: https://github.com/golangci/golangci-lint-action@v3
with:
version: v1.54

.gitignore (vendored, 3 changed lines)

@@ -1,2 +1 @@
-/__build/*
-!__build/.gitkeep
+/crawler

@@ -6,7 +6,7 @@ run:
   tests: true
 output:
-  format: colored-line-number
+  formats: colored-line-number
   print-issues-lines: true
   print-linter-name: true
   uniq-by-line: true
@@ -18,5 +18,7 @@ linters-settings:
 linters:
   enable-all: true
-  # disable:
+  disable:
+    - execinquery
+    - gomnd
   fast: false

LICENSE (new file, 21 lines)

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 Dan Anglin
Permission is hereby granted, free of charge, to any person obtaining a copy of this
software and associated documentation files (the “Software”), to deal in the Software
without restriction, including without limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons
to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or
substantial portions of the Software.
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

go.mod (new file, 5 lines)

@@ -0,0 +1,5 @@
module codeflow.dananglin.me.uk/apollo/web-crawler
go 1.23.0
require golang.org/x/net v0.28.0

go.sum (new file, 2 lines)

@@ -0,0 +1,2 @@
golang.org/x/net v0.28.0 h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE=
golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg=

internal/crawler/crawler.go (new file, 168 lines)

@@ -0,0 +1,168 @@
package crawler
import (
"fmt"
"net/url"
"os"
"sync"
"codeflow.dananglin.me.uk/apollo/web-crawler/internal/report"
"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
)
type Crawler struct {
pages map[string]int
baseURL *url.URL
mu *sync.Mutex
concurrencyControl chan struct{}
wg *sync.WaitGroup
maxPages int
}
func NewCrawler(rawBaseURL string, maxConcurrency, maxPages int) (*Crawler, error) {
baseURL, err := url.Parse(rawBaseURL)
if err != nil {
return nil, fmt.Errorf("unable to parse the base URL: %w", err)
}
var waitGroup sync.WaitGroup
waitGroup.Add(1)
crawler := Crawler{
pages: make(map[string]int),
baseURL: baseURL,
mu: &sync.Mutex{},
concurrencyControl: make(chan struct{}, maxConcurrency),
wg: &waitGroup,
maxPages: maxPages,
}
return &crawler, nil
}
func (c *Crawler) Crawl(rawCurrentURL string) {
// Add an empty struct to channel here
c.concurrencyControl <- struct{}{}
// Decrement the wait group counter and free up the channel when finished
// crawling.
defer func() {
<-c.concurrencyControl
c.wg.Done()
}()
if c.reachedMaxPages() {
return
}
// if current URL is not on the same domain as the base URL then return early.
hasEqualDomain, err := c.HasEqualDomain(rawCurrentURL)
if err != nil {
fmt.Printf(
"WARNING: Unable to determine if %q has the same domain as %q; %v.\n",
rawCurrentURL,
c.baseURL.Hostname(),
err,
)
return
}
if !hasEqualDomain {
return
}
// get normalised version of rawCurrentURL
normalisedCurrentURL, err := util.NormaliseURL(rawCurrentURL)
if err != nil {
fmt.Printf("WARNING: Error normalising %q: %v.\n", rawCurrentURL, err)
return
}
// Add (or update) a record of the URL in the pages map.
// If there's already an entry of the URL in the map then return early.
if existed := c.AddPageVisit(normalisedCurrentURL); existed {
return
}
// Get the HTML document from the current URL.
fmt.Printf("Crawling %q\n", rawCurrentURL)
htmlDoc, err := getHTML(rawCurrentURL)
if err != nil {
fmt.Printf(
"WARNING: Error retrieving the HTML document from %q: %v.\n",
rawCurrentURL,
err,
)
return
}
// Get all the URLs from the HTML doc.
links, err := util.GetURLsFromHTML(htmlDoc, c.baseURL.String())
if err != nil {
fmt.Printf(
"WARNING: Error retrieving the links from the HTML document: %v.\n",
err,
)
return
}
// Recursively crawl each URL on the page.
for ind := range len(links) {
c.wg.Add(1)
go c.Crawl(links[ind])
}
}
func (c *Crawler) HasEqualDomain(rawURL string) (bool, error) {
parsedRawURL, err := url.Parse(rawURL)
if err != nil {
return false, fmt.Errorf("error parsing the URL %q: %w", rawURL, err)
}
return c.baseURL.Hostname() == parsedRawURL.Hostname(), nil
}
// AddPageVisit adds a record of the visited page's URL to the pages map.
// If there is already a record of the URL then its count is incremented
// and the method returns true. If the URL is not already recorded then a new
// record is created and the method returns false.
func (c *Crawler) AddPageVisit(normalisedURL string) bool {
c.mu.Lock()
defer c.mu.Unlock()
_, exists := c.pages[normalisedURL]
if exists {
c.pages[normalisedURL]++
} else {
c.pages[normalisedURL] = 1
}
return exists
}
func (c *Crawler) Wait() {
c.wg.Wait()
}
func (c *Crawler) PrintReport() {
c.mu.Lock()
defer c.mu.Unlock()
r := report.NewReport(c.baseURL.String(), c.pages)
fmt.Fprint(os.Stdout, r)
}
func (c *Crawler) reachedMaxPages() bool {
c.mu.Lock()
defer c.mu.Unlock()
return len(c.pages) >= c.maxPages
}
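Aside (not part of this commit): Crawl limits parallelism with a buffered
channel used as a semaphore alongside a sync.WaitGroup. A minimal,
self-contained sketch of that pattern, with placeholder names and limits:

package main

import (
	"fmt"
	"sync"
)

func main() {
	// Placeholder limit; the Crawler receives this as maxConcurrency.
	const maxConcurrency = 3

	semaphore := make(chan struct{}, maxConcurrency)

	var waitGroup sync.WaitGroup

	for task := range 10 {
		waitGroup.Add(1)

		go func() {
			// Block until a slot is free.
			semaphore <- struct{}{}

			defer func() {
				// Free the slot and mark this task as done.
				<-semaphore
				waitGroup.Done()
			}()

			fmt.Println("processing task", task)
		}()
	}

	waitGroup.Wait()
}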

@@ -0,0 +1,172 @@
package crawler_test
import (
"fmt"
"slices"
"testing"
"codeflow.dananglin.me.uk/apollo/web-crawler/internal/crawler"
"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
)
func TestCrawler(t *testing.T) {
testBaseURL := "https://example.com"
testCrawler, err := crawler.NewCrawler(testBaseURL, 1, 10)
if err != nil {
t.Fatalf("Test 'TestCrawler' FAILED: unexpected error creating the crawler: %v", err)
}
testCasesForEqualDomains := []struct {
name string
rawURL string
want bool
}{
{
name: "Same domain",
rawURL: "https://example.com",
want: true,
},
{
name: "Same domain, different path",
rawURL: "https://example.com/about/contact",
want: true,
},
{
name: "Same domain, different protocol",
rawURL: "http://example.com",
want: true,
},
{
name: "Different domain",
rawURL: "https://blog.person.me.uk",
want: false,
},
{
name: "Different domain, same path",
rawURL: "https://example.org/blog",
want: false,
},
}
for ind, tc := range slices.All(testCasesForEqualDomains) {
t.Run(tc.name, testHasEqualDomains(
testCrawler,
ind+1,
tc.name,
tc.rawURL,
tc.want,
))
}
testCasesForPages := []struct {
rawURL string
wantVisited bool
}{
{
rawURL: "https://example.com/tags/linux",
wantVisited: false,
},
{
rawURL: "https://example.com/blog",
wantVisited: false,
},
{
rawURL: "https://example.com/about/contact.html",
wantVisited: false,
},
{
rawURL: "https://example.com/blog",
wantVisited: true,
},
}
for ind, tc := range slices.All(testCasesForPages) {
name := fmt.Sprintf("Adding %s to the pages map", tc.rawURL)
t.Run(name, testAddPageVisit(
testCrawler,
ind+1,
name,
tc.rawURL,
tc.wantVisited,
))
}
}
func testHasEqualDomains(
testCrawler *crawler.Crawler,
testNum int,
testName string,
rawURL string,
want bool,
) func(t *testing.T) {
return func(t *testing.T) {
t.Parallel()
got, err := testCrawler.HasEqualDomain(rawURL)
if err != nil {
t.Fatalf(
"Test %d - '%s' FAILED: unexpected error: %v",
testNum,
testName,
err,
)
}
if got != want {
t.Errorf(
"Test %d - '%s' FAILED: unexpected domain comparison received: want %t, got %t",
testNum,
testName,
want,
got,
)
} else {
t.Logf(
"Test %d - '%s' PASSED: expected domain comparison received: got %t",
testNum,
testName,
got,
)
}
}
}
func testAddPageVisit(
testCrawler *crawler.Crawler,
testNum int,
testName string,
rawURL string,
wantVisited bool,
) func(t *testing.T) {
return func(t *testing.T) {
normalisedURL, err := util.NormaliseURL(rawURL)
if err != nil {
t.Fatalf(
"Test %d - '%s' FAILED: unexpected error: %v",
testNum,
testName,
err,
)
}
gotVisited := testCrawler.AddPageVisit(normalisedURL)
if gotVisited != wantVisited {
t.Errorf(
"Test %d - '%s' FAILED: unexpected bool returned after updated pages record: want %t, got %t",
testNum,
testName,
wantVisited,
gotVisited,
)
} else {
t.Logf(
"Test %d - '%s' PASSED: expected bool returned after updated pages record: got %t",
testNum,
testName,
gotVisited,
)
}
}
}

@@ -0,0 +1,50 @@
package crawler
import (
"context"
"fmt"
"io"
"net/http"
"strings"
"time"
)
func getHTML(rawURL string) (string, error) {
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(10*time.Second))
defer cancel()
request, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
if err != nil {
return "", fmt.Errorf("error creating the HTTP request: %w", err)
}
client := http.Client{}
resp, err := client.Do(request)
if err != nil {
return "", fmt.Errorf("error getting the response: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode >= 400 {
return "", fmt.Errorf(
"received a bad status from %s: (%d) %s",
rawURL,
resp.StatusCode,
resp.Status,
)
}
contentType := resp.Header.Get("content-type")
if !strings.Contains(contentType, "text/html") {
return "", fmt.Errorf("unexpected content type received: want text/html, got %s", contentType)
}
data, err := io.ReadAll(resp.Body)
if err != nil {
return "", fmt.Errorf("error reading the data from the response: %w", err)
}
return string(data), nil
}

internal/report/report.go (new file, 66 lines)

@@ -0,0 +1,66 @@
package report
import (
"cmp"
"maps"
"slices"
"strconv"
"strings"
)
type Report struct {
baseURL string
records []Record
}
type Record struct {
link string
count int
}
func NewReport(baseURL string, pages map[string]int) Report {
records := make([]Record, 0)
for link, count := range maps.All(pages) {
records = append(records, Record{link: link, count: count})
}
report := Report{
baseURL: baseURL,
records: records,
}
report.SortRecords()
return report
}
func (r *Report) SortRecords() {
// First sort records by count in descending order.
// Then sort records by name if two elements have the same count.
slices.SortFunc(r.records, func(a, b Record) int {
if n := cmp.Compare(a.count, b.count); n != 0 {
return -1 * n
}
return strings.Compare(a.link, b.link)
})
}
func (r Report) String() string {
var builder strings.Builder
titlebar := strings.Repeat("\u2500", 80)
builder.WriteString("\n" + titlebar)
builder.WriteString("\n" + "REPORT for " + r.baseURL)
builder.WriteString("\n" + titlebar)
for ind := range slices.All(r.records) {
builder.WriteString("\nFound " + strconv.Itoa(r.records[ind].count) + " internal links to " + r.records[ind].link)
}
builder.WriteString("\n")
return builder.String()
}
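For reference, the String method renders the report roughly as follows (the
URLs and counts are made up for illustration; the real title bar is 80
box-drawing characters wide):

────────────────────────────────────────
REPORT for https://example.com
────────────────────────────────────────
Found 3 internal links to example.com/blog
Found 1 internal links to example.com/about/contact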

internal/util/html.go (new file, 70 lines)

@@ -0,0 +1,70 @@
package util
import (
"fmt"
"net/url"
"strings"
"golang.org/x/net/html"
)
func GetURLsFromHTML(htmlBody, rawBaseURL string) ([]string, error) {
htmlDoc, err := html.Parse(strings.NewReader(htmlBody))
if err != nil {
return []string{}, fmt.Errorf("unable to parse the HTML document: %w", err)
}
parsedRawBaseURL, err := url.Parse(rawBaseURL)
if err != nil {
return []string{}, fmt.Errorf("unable to parse the raw base URL %q: %w", rawBaseURL, err)
}
output := make([]string, 0, 3)
var extractLinkFunc func(*html.Node) error
extractLinkFunc = func(node *html.Node) error {
if node.Type == html.ElementNode && node.Data == "a" {
for _, a := range node.Attr {
if a.Key == "href" {
extractedURL, err := getAbsoluteURL(a.Val, parsedRawBaseURL)
if err != nil {
return fmt.Errorf("unable to get the absolute URL of %s: %w", a.Val, err)
}
output = append(output, extractedURL)
break
}
}
}
for c := node.FirstChild; c != nil; c = c.NextSibling {
if err := extractLinkFunc(c); err != nil {
return err
}
}
return nil
}
if err := extractLinkFunc(htmlDoc); err != nil {
return []string{}, err
}
return output, nil
}
func getAbsoluteURL(inputURL string, baseURL *url.URL) (string, error) {
parsedURL, err := url.Parse(inputURL)
if err != nil {
return "", fmt.Errorf("unable to parse the URL from %s: %w", inputURL, err)
}
if parsedURL.Scheme == "" && parsedURL.Host == "" {
parsedURL.Scheme = baseURL.Scheme
parsedURL.Host = baseURL.Host
}
return parsedURL.String(), nil
}

@@ -0,0 +1,99 @@
package util_test
import (
"os"
"reflect"
"slices"
"testing"
"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
)
func TestGetURLsFromHTML(t *testing.T) {
t.Parallel()
cases := []struct {
name string
filepath string
baseURL string
want []string
}{
{
name: "HTML document using blog.boot.dev",
filepath: "testdata/GetURLFromHTML/blog.boot.dev.html",
baseURL: "https://blog.boot.dev",
want: []string{
"https://blog.boot.dev/path/one",
"https://other.com/path/one",
},
},
{
name: "HTML document using https://ben-bartlett.me.uk",
filepath: "testdata/GetURLFromHTML/ben-bartlett.html",
baseURL: "https://ben-bartlett.me.uk",
want: []string{
"https://ben-bartlett.me.uk",
"https://github.com/ben-bartlett",
"https://mastodon.ben-bartlett.me.uk",
"https://ben-bartlett.me.uk/blog",
"https://ben-bartlett.me.uk/projects/orange-juice",
"https://ben-bartlett.me.uk/projects/mustangs",
"https://ben-bartlett.me.uk/projects/honeycombs",
},
},
{
name: "HTML document using https://simple.cooking",
filepath: "testdata/GetURLFromHTML/my-simple-cooking-website.html",
baseURL: "https://simple.cooking",
want: []string{
"https://simple.cooking/recipes/sweet-n-sour-kung-pao-style-chicken",
"https://simple.cooking/recipes/beef-and-broccoli",
"https://simple.cooking/recipes/asian-glazed-salmon",
"https://simple.cooking/recipes/caesar-salad",
"https://simple.cooking/recipes/simple-tuna-salad",
"https://simple.cooking/recipes/wholemeal-pizza",
"https://simple.cooking/news",
"https://simple.cooking/about/contact",
"https://the-other-site.example.new/home",
},
},
}
for _, tc := range slices.All(cases) {
t.Run(tc.name, testGetURLsFromHTML(tc.filepath, tc.baseURL, tc.want))
}
}
func testGetURLsFromHTML(path, baseURL string, want []string) func(t *testing.T) {
failedTestPrefix := "Test TestGetURLsFromHTML FAILED:"
return func(t *testing.T) {
t.Parallel()
htmlDoc, err := os.ReadFile(path)
if err != nil {
t.Fatalf("%s unable to open read data from %s: %v", failedTestPrefix, path, err)
}
got, err := util.GetURLsFromHTML(string(htmlDoc), baseURL)
if err != nil {
t.Fatalf(
"Test TestGetURLsFromHTML FAILED: unexpected error: %v",
err,
)
}
if !reflect.DeepEqual(want, got) {
t.Errorf(
"Test TestGetURLsFromHTML FAILED: unexpected URLs found in HTML body: want %v, got %v",
want,
got,
)
} else {
t.Logf(
"Test TestGetURLsFromHTML PASSED: expected URLs found in HTML body: got %v",
got,
)
}
}
}

@@ -0,0 +1,34 @@
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
<head>
<meta charset="utf-8" />
<title>Ben Bartlett</title>
<style>
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
span.underline{text-decoration: underline;}
div.column{display: inline-block; vertical-align: top; width: 50%;}
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
ul.task-list{list-style: none;}
</style>
</head>
<body>
<header id="title-block-header">
<h1 class="title">Ben Bartlett</h1>
</header>
<p>Hey there! Ben Bartlett here. I am a Backend software engineer working in the healthcare industry. At night I am a hobbyist developer of 2D games. When I'm not coding I would find myself cooking, reading engaging novels, and going on the occasional hike or two.</p>
<h2 id="my-links">My Links</h2>
<ul>
<li><a href="https://ben-bartlett.me.uk">My website</a></li>
<li><a href="https://github.com/ben-bartlett">GitHub</a></li>
<li><a href="https://mastodon.ben-bartlett.me.uk">Mastodon</a></li>
<li><a href="/blog">My blog</a></li>
</ul>
<h2 id="projects-im-working-on">Projects I'm working on</h2>
<ul>
<li><a href="/projects/orange-juice">Orange Juice</a></li>
<li><a href="/projects/mustangs">Mustangs</a></li>
<li><a href="/projects/honeycombs">Honeycombs</a></li>
</ul>
</body>
</html>

@@ -0,0 +1,10 @@
<html>
<body>
<a href="/path/one">
<span>Boot.dev</span>
</a>
<a href="https://other.com/path/one">
<span>Boot.dev</span>
</a>
</body>
</html>

@@ -0,0 +1,37 @@
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes" />
<title>My simple cooking website</title>
<style>
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
span.underline{text-decoration: underline;}
div.column{display: inline-block; vertical-align: top; width: 50%;}
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
ul.task-list{list-style: none;}
</style>
</head>
<body>
<header id="title-block-header">
<h1 class="title">My simple cooking website</h1>
</header>
<p>Find my favourite recipes here.</p>
<h2 id="recipes">Recipes</h2>
<ul>
<li><a href="/recipes/sweet-n-sour-kung-pao-style-chicken">Sweet 'n' Sour Kung Pao-Style Chicken</a></li>
<li><a href="/recipes/beef-and-broccoli">Beef and Broccoli</a></li>
<li><a href="/recipes/asian-glazed-salmon">Asian Glazed Salmon</a></li>
<li><a href="/recipes/caesar-salad">Caesar Salad</a></li>
<li><a href="/recipes/simple-tuna-salad">Simple Tuna Salad</a></li>
<li><a href="/recipes/wholemeal-pizza">Wholemeal Pizza</a></li>
</ul>
<h2 id="links">Links</h2>
<ul>
<li><a href="/news">News</a></li>
<li><a href="/about/contact">Contact</a></li>
<li><a href="https://the-other-site.example.new/home">The other site</a></li>
</ul>
</body>
</html>

internal/util/url.go (new file, 18 lines)

@@ -0,0 +1,18 @@
package util
import (
"fmt"
"net/url"
"strings"
)
func NormaliseURL(rawURL string) (string, error) {
const normalisedFormat string = "%s%s"
parsedURL, err := url.Parse(rawURL)
if err != nil {
return "", fmt.Errorf("error parsing the URL %q: %w", rawURL, err)
}
return fmt.Sprintf(normalisedFormat, parsedURL.Hostname(), strings.TrimSuffix(parsedURL.Path, "/")), nil
}

internal/util/url_test.go (new file, 81 lines)

@@ -0,0 +1,81 @@
package util_test
import (
"slices"
"testing"
"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
)
func TestNormaliseURL(t *testing.T) {
t.Parallel()
wantNormalisedURL := "blog.boot.dev/path"
cases := []struct {
name string
inputURL string
}{
{
name: "remove HTTPS scheme",
inputURL: "https://blog.boot.dev/path",
},
{
name: "remove HTTP scheme",
inputURL: "http://blog.boot.dev/path",
},
{
name: "remove HTTPS scheme with a trailing slash",
inputURL: "https://blog.boot.dev/path/",
},
{
name: "remove HTTP scheme with a trailing slash",
inputURL: "http://blog.boot.dev/path/",
},
{
name: "remove HTTPS scheme with port 443",
inputURL: "https://blog.boot.dev:443/path",
},
{
name: "remove HTTP scheme with port 80",
inputURL: "http://blog.boot.dev:80/path",
},
{
name: "normalised URL",
inputURL: "blog.boot.dev/path",
},
}
for ind, tc := range slices.All(cases) {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
got, err := util.NormaliseURL(tc.inputURL)
if err != nil {
t.Fatalf(
"Test %d - '%s' FAILED: unexpected error: %v",
ind,
tc.name,
err,
)
}
if got != wantNormalisedURL {
t.Errorf(
"Test %d - %s FAILED: unexpected normalised URL returned: want %s, got %s",
ind,
tc.name,
wantNormalisedURL,
got,
)
} else {
t.Logf(
"Test %d - %s PASSED: expected normalised URL returned: got %s",
ind,
tc.name,
got,
)
}
})
}
}

magefiles/go.mod (new file, 5 lines)

@@ -0,0 +1,5 @@
module codeflow.dananglin.me.uk/apollo/web-crawler/magefiles
go 1.23.0
require github.com/magefile/mage v1.15.0

magefiles/go.sum (new file, 2 lines)

@@ -0,0 +1,2 @@
github.com/magefile/mage v1.15.0 h1:BvGheCMAsG3bWUDbZ8AyXXpCNwU9u5CB6sM+HNb9HYg=
github.com/magefile/mage v1.15.0/go.mod h1:z5UZb/iS3GoOSn0JgWuiw7dxlurVYTu+/jHXqQg881A=


@@ -14,23 +14,23 @@ import (
 )

 const (
-	app                  = "binary"
+	app                  = "crawler"
 	defaultInstallPrefix = "/usr/local"
-	envInstallPrefix     = "PROJECT_INSTALL_PREFIX"
-	envTestVerbose       = "PROJECT_TEST_VERBOSE"
-	envTestCover         = "PROJECT_TEST_COVER"
-	envBuildRebuildAll   = "PROJECT_BUILD_REBUILD_ALL"
-	envBuildVerbose      = "PROJECT_BUILD_VERBOSE"
+	envInstallPrefix     = "CRAWLER_INSTALL_PREFIX"
+	envTestVerbose       = "CRAWLER_TEST_VERBOSE"
+	envTestCover         = "CRAWLER_TEST_COVER"
+	envBuildRebuildAll   = "CRAWLER_BUILD_REBUILD_ALL"
+	envBuildVerbose      = "CRAWLER_BUILD_VERBOSE"
 )

 var (
 	Default = Build
-	binary  = "./__build/" + app
+	binary  = app
 )

 // Test run the go tests.
-// To enable verbose mode set PROJECT_TEST_VERBOSE=1.
-// To enable coverage mode set PROJECT_TEST_COVER=1.
+// To enable verbose mode set CRAWLER_TEST_VERBOSE=1.
+// To enable coverage mode set CRAWLER_TEST_COVER=1.
 func Test() error {
 	goTest := sh.RunCmd("go", "test")
@@ -56,10 +56,10 @@ func Lint() error {
 // To rebuild packages that are already up-to-date set PROJECT_BUILD_REBUILD_ALL=1
 // To enable verbose mode set PROJECT_BUILD_VERBOSE=1
 func Build() error {
-	main := "main.go"
-	flags := ldflags()
+	main := "."
+	//flags := ldflags()
 	build := sh.RunCmd("go", "build")
-	args := []string{"-ldflags=" + flags, "-o", binary}
+	args := []string{"-ldflags=-s -w", "-o", binary}

 	if os.Getenv(envBuildRebuildAll) == "1" {
 		args = append(args, "-a")

main.go (40 changed lines)

@@ -3,22 +3,48 @@ package main
 import (
 	"fmt"
 	"os"
-)
-
-var (
-	binaryVersion string
-	buildTime     string
-	goVersion     string
-	gitCommit     string
+	"strconv"
+
+	"codeflow.dananglin.me.uk/apollo/web-crawler/internal/crawler"
 )

 func main() {
 	if err := run(); err != nil {
-		fmt.Printf("ERROR: %v.\n", err)
+		os.Stderr.WriteString("ERROR: " + err.Error() + "\n")
 		os.Exit(1)
 	}
 }

 func run() error {
+	args := os.Args[1:]
+
+	if len(args) != 3 {
+		return fmt.Errorf("unexpected number of arguments received: want 3, got %d", len(args))
+	}
+
+	baseURL := args[0]
+
+	maxConcurrency, err := strconv.Atoi(args[1])
+	if err != nil {
+		return fmt.Errorf("unable to convert the max concurrency (%s) to an integer: %w", args[1], err)
+	}
+
+	maxPages, err := strconv.Atoi(args[2])
+	if err != nil {
+		return fmt.Errorf("unable to convert the max pages (%s) to an integer: %w", args[2], err)
+	}
+
+	c, err := crawler.NewCrawler(baseURL, maxConcurrency, maxPages)
+	if err != nil {
+		return fmt.Errorf("unable to create the crawler: %w", err)
+	}
+
+	go c.Crawl(baseURL)
+
+	c.Wait()
+
+	c.PrintReport()
+
 	return nil
 }