wip

2024-08-26 18:37:45 +01:00 · 2024-08-26 18:37:45 +01:00 · 235132d0cc
commit 235132d0cc
parent 5d447923b1
16 changed files with 390 additions and 15 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,2 +1 @@
-/__build/*
-!__build/.gitkeep
+crawler
--- a/.golangci.yaml
+++ b/.golangci.yaml
@ -6,7 +6,7 @@ run:
  tests: true

 output:
-  format: colored-line-number
+  formats: colored-line-number
  print-issues-lines: true
  print-linter-name: true
  uniq-by-line: true
@ -18,5 +18,7 @@ linters-settings:

 linters:
  enable-all: true
-  # disable:
+  disable:
+  - execinquery
+  - gomnd
  fast: false
--- a/__build/.gitkeep
+++ b/__build/.gitkeep
--- a/get_urls_from_html.go
+++ b/get_urls_from_html.go
@ -0,0 +1,70 @@
+package main
+
+import (
+	"fmt"
+	"net/url"
+	"strings"
+
+	"golang.org/x/net/html"
+)
+
+// getURLsFromHTML parses htmlBody as an HTML document and returns the link
+// target of every <a> element found in it.
+//
+// The document tree is walked depth-first, visiting a node before its
+// children, so the returned URLs appear in document order. For each anchor
+// only the first href attribute is considered; its value is converted with
+// getAbsoluteURL using the parsed rawBaseURL as the base.
+//
+// An empty, non-nil slice and a wrapped error are returned when the document
+// or rawBaseURL cannot be parsed, or when getAbsoluteURL fails for any href.
+func getURLsFromHTML(htmlBody, rawBaseURL string) ([]string, error) {
+	root, err := html.Parse(strings.NewReader(htmlBody))
+	if err != nil {
+		return []string{}, fmt.Errorf("unable to parse the HTML document: %w", err)
+	}
+
+	base, err := url.Parse(rawBaseURL)
+	if err != nil {
+		return []string{}, fmt.Errorf("unable to parse the raw base URL %q: %w", rawBaseURL, err)
+	}
+
+	links := make([]string, 0, 3)
+
+	// walk is declared before it is assigned so that the closure can call
+	// itself recursively.
+	var walk func(*html.Node) error
+
+	walk = func(current *html.Node) error {
+		if current.Type == html.ElementNode && current.Data == "a" {
+			for i := range current.Attr {
+				if current.Attr[i].Key != "href" {
+					continue
+				}
+
+				href := current.Attr[i].Val
+
+				absolute, err := getAbsoluteURL(href, base)
+				if err != nil {
+					return fmt.Errorf("unable to get the absolute URL of %s: %w", href, err)
+				}
+
+				links = append(links, absolute)
+
+				// Only the first href of an anchor is recorded.
+				break
+			}
+		}
+
+		child := current.FirstChild
+		for child != nil {
+			if err := walk(child); err != nil {
+				return err
+			}
+
+			child = child.NextSibling
+		}
+
+		return nil
+	}
+
+	if err := walk(root); err != nil {
+		return []string{}, err
+	}
+
+	return links, nil
+}
+
+// getAbsoluteURL resolves inputURL against baseURL and returns the resulting
+// URL as a string.
+//
+// Resolution is delegated to url.URL.ResolveReference (RFC 3986, section
+// 5.2). Besides root-relative references such as "/blog", this also covers
+// path-relative references ("about" is resolved against the directory of the
+// base path) and scheme-relative references ("//cdn.example.com/x" inherits
+// the scheme of baseURL). When inputURL is already absolute, baseURL is
+// ignored.
+//
+// baseURL must be non-nil. An error is returned when inputURL cannot be
+// parsed.
+func getAbsoluteURL(inputURL string, baseURL *url.URL) (string, error) {
+	reference, err := url.Parse(inputURL)
+	if err != nil {
+		return "", fmt.Errorf("unable to parse the URL from %s: %w", inputURL, err)
+	}
+
+	return baseURL.ResolveReference(reference).String(), nil
+}
--- a/get_urls_from_html_test.go
+++ b/get_urls_from_html_test.go
@ -0,0 +1,97 @@
+package main
+
+import (
+	"os"
+	"reflect"
+	"slices"
+	"testing"
+)
+
+// TestGetURLsFromHTML runs getURLsFromHTML against the HTML fixtures stored
+// under tests/GetURLFromHTML. Each case names the fixture file, the base URL
+// handed to getURLsFromHTML and the URLs expected back, in order. Every case
+// runs as its own subtest built by testGetURLsFromHTML.
+func TestGetURLsFromHTML(t *testing.T) {
+	t.Parallel()
+
+	type fixtureCase struct {
+		name     string
+		filepath string
+		baseURL  string
+		want     []string
+	}
+
+	fixtures := []fixtureCase{
+		{
+			name:     "HTML documentation using blog.boot.dev",
+			filepath: "tests/GetURLFromHTML/blog.boot.dev.html",
+			baseURL:  "https://blog.boot.dev",
+			want: []string{
+				"https://blog.boot.dev/path/one",
+				"https://other.com/path/one",
+			},
+		},
+		{
+			name:     "HTML documentation using https://ben-bartlett.me.uk",
+			filepath: "tests/GetURLFromHTML/ben-bartlett.html",
+			baseURL:  "https://ben-bartlett.me.uk",
+			want: []string{
+				"https://ben-bartlett.me.uk",
+				"https://github.com/ben-bartlett",
+				"https://mastodon.ben-bartlett.me.uk",
+				"https://ben-bartlett.me.uk/blog",
+				"https://ben-bartlett.me.uk/projects/orange-juice",
+				"https://ben-bartlett.me.uk/projects/mustangs",
+				"https://ben-bartlett.me.uk/projects/honeycombs",
+			},
+		},
+		{
+			name:     "HTML documentation using https://simple.cooking",
+			filepath: "tests/GetURLFromHTML/my-simple-cooking-website.html",
+			baseURL:  "https://simple.cooking",
+			want: []string{
+				"https://simple.cooking/recipes/sweet-n-sour-kung-pao-style-chicken",
+				"https://simple.cooking/recipes/beef-and-broccoli",
+				"https://simple.cooking/recipes/asian-glazed-salmon",
+				"https://simple.cooking/recipes/caesar-salad",
+				"https://simple.cooking/recipes/simple-tuna-salad",
+				"https://simple.cooking/recipes/wholemeal-pizza",
+				"https://simple.cooking/news",
+				"https://simple.cooking/about/contact",
+				"https://the-other-site.example.new/home",
+			},
+		},
+	}
+
+	for _, fixture := range slices.All(fixtures) {
+		subtest := testGetURLsFromHTML(fixture.filepath, fixture.baseURL, fixture.want)
+
+		t.Run(fixture.name, subtest)
+	}
+}
+
+// testGetURLsFromHTML builds the subtest body for a single
+// TestGetURLsFromHTML case. The returned function reads the HTML fixture at
+// path, passes its contents and baseURL to getURLsFromHTML, and compares the
+// result with want using reflect.DeepEqual, so both the URLs and their order
+// have to match. A fixture that cannot be read, or an error from
+// getURLsFromHTML, aborts the subtest.
+func testGetURLsFromHTML(path, baseURL string, want []string) func(t *testing.T) {
+	const failedTestPrefix = "Test TestGetURLsFromHTML FAILED:"
+
+	return func(t *testing.T) {
+		t.Parallel()
+
+		fixture, readErr := os.ReadFile(path)
+		if readErr != nil {
+			t.Fatalf("%s unable to open read data from %s: %v", failedTestPrefix, path, readErr)
+		}
+
+		got, err := getURLsFromHTML(string(fixture), baseURL)
+		if err != nil {
+			t.Fatalf(
+				"Test TestGetURLsFromHTML FAILED: unexpected error: %v",
+				err,
+			)
+		}
+
+		if reflect.DeepEqual(want, got) {
+			t.Logf(
+				"Test TestGetURLsFromHTML PASSED: expected URLs found in HTML body: got %v",
+				got,
+			)
+
+			return
+		}
+
+		t.Errorf(
+			"Test TestGetURLsFromHTML FAILED: unexpected URLs found in HTML body: want %v, got %v",
+			want,
+			got,
+		)
+	}
+}
--- a/go.mod
+++ b/go.mod
@ -0,0 +1,5 @@
+module codeflow.dananglin.me.uk/apollo/web-crawler
+
+go 1.23.0
+
+require golang.org/x/net v0.28.0
--- a/go.sum
+++ b/go.sum
@ -0,0 +1,2 @@
+golang.org/x/net v0.28.0 h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE=
+golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg=
--- a/magefiles/go.mod
+++ b/magefiles/go.mod
@ -0,0 +1,5 @@
+module codeflow.dananglin.me.uk/apollo/web-crawler/magefiles
+
+go 1.23.0
+
+require github.com/magefile/mage v1.15.0
--- a/magefiles/go.sum
+++ b/magefiles/go.sum
@ -0,0 +1,2 @@
+github.com/magefile/mage v1.15.0 h1:BvGheCMAsG3bWUDbZ8AyXXpCNwU9u5CB6sM+HNb9HYg=
+github.com/magefile/mage v1.15.0/go.mod h1:z5UZb/iS3GoOSn0JgWuiw7dxlurVYTu+/jHXqQg881A=
--- a/magefiles/mage.go
+++ b/magefiles/mage.go
@ -14,23 +14,23 @@ import (
 )

 const (
-	app                  = "binary"
+	app                  = "crawler"
 	defaultInstallPrefix = "/usr/local"
-	envInstallPrefix     = "PROJECT_INSTALL_PREFIX"
-	envTestVerbose       = "PROJECT_TEST_VERBOSE"
-	envTestCover         = "PROJECT_TEST_COVER"
-	envBuildRebuildAll   = "PROJECT_BUILD_REBUILD_ALL"
-	envBuildVerbose      = "PROJECT_BUILD_VERBOSE"
+	envInstallPrefix     = "CRAWLER_INSTALL_PREFIX"
+	envTestVerbose       = "CRAWLER_TEST_VERBOSE"
+	envTestCover         = "CRAWLER_TEST_COVER"
+	envBuildRebuildAll   = "CRAWLER_BUILD_REBUILD_ALL"
+	envBuildVerbose      = "CRAWLER_BUILD_VERBOSE"
 )

 var (
 	Default = Build
-	binary  = "./__build/" + app
+	binary  = app
 )

 // Test run the go tests.
-// To enable verbose mode set PROJECT_TEST_VERBOSE=1.
-// To enable coverage mode set PROJECT_TEST_COVER=1.
+// To enable verbose mode set CRAWLER_TEST_VERBOSE=1.
+// To enable coverage mode set CRAWLER_TEST_COVER=1.
 func Test() error {
 	goTest := sh.RunCmd("go", "test")

@ -56,7 +56,7 @@ func Lint() error {
 // To rebuild packages that are already up-to-date set PROJECT_BUILD_REBUILD_ALL=1
 // To enable verbose mode set PROJECT_BUILD_VERBOSE=1
 func Build() error {
-	main := "main.go"
+	main := "."
 	flags := ldflags()
 	build := sh.RunCmd("go", "build")
 	args := []string{"-ldflags=" + flags, "-o", binary}
--- a/main.go
+++ b/main.go
@ -1,6 +1,7 @@
 package main

 import (
+	"errors"
 	"fmt"
 	"os"
 )
@ -14,11 +15,25 @@ var (

 func main() {
 	if err := run(); err != nil {
-		fmt.Printf("ERROR: %v.\n", err)
+		fmt.Println(err)
 		os.Exit(1)
 	}
 }

 func run() error {
+	args := os.Args[1:]
+
+	if len(args) == 0 {
+		return errors.New("no website provided")
+	}
+
+	if len(args) > 1 {
+		return errors.New("too many arguments provided")
+	}
+
+	baseURL := args[0]
+
+	fmt.Printf("starting crawl of: %s\n", baseURL)
+
 	return nil
 }
--- a/normalise_url.go
+++ b/normalise_url.go
@ -0,0 +1,18 @@
+package main
+
+import (
+	"fmt"
+	"net/url"
+	"strings"
+)
+
+// normaliseURL reduces input to the form "<hostname><path>": the scheme,
+// port, query and fragment are dropped and a single trailing slash is removed
+// from the path. The hostname is taken from url.URL.Hostname, so its letter
+// case is left as written.
+//
+// An input without a scheme (for example "blog.boot.dev/path") is parsed by
+// url.Parse as a bare path with an empty host, so it is returned unchanged
+// apart from the trailing-slash removal.
+//
+// An error is returned when input cannot be parsed as a URL.
+func normaliseURL(input string) (string, error) {
+	parsed, err := url.Parse(input)
+	if err != nil {
+		return "", fmt.Errorf("error parsing the URL %q: %w", input, err)
+	}
+
+	host := parsed.Hostname()
+	path := strings.TrimSuffix(parsed.Path, "/")
+
+	return host + path, nil
+}
--- a/normalise_url_test.go
+++ b/normalise_url_test.go
@ -0,0 +1,79 @@
+package main
+
+import (
+	"slices"
+	"testing"
+)
+
+// TestNormaliseURL checks that normaliseURL maps every listed spelling of the
+// same page (HTTP or HTTPS scheme, trailing slash, explicit default port, or
+// no scheme at all) to the single value "blog.boot.dev/path". Each case runs
+// as a parallel subtest.
+func TestNormaliseURL(t *testing.T) {
+	t.Parallel()
+
+	wantNormalisedURL := "blog.boot.dev/path"
+
+	cases := []struct {
+		name     string
+		inputURL string
+	}{
+		{
+			name:     "remove HTTPS scheme",
+			inputURL: "https://blog.boot.dev/path",
+		},
+		{
+			name:     "remove HTTP scheme",
+			inputURL: "http://blog.boot.dev/path",
+		},
+		{
+			name:     "remove HTTPS scheme with a trailing slash",
+			inputURL: "https://blog.boot.dev/path/",
+		},
+		{
+			name:     "remove HTTP scheme with a trailing slash",
+			inputURL: "http://blog.boot.dev/path/",
+		},
+		{
+			name:     "remove HTTPS scheme with port 443",
+			inputURL: "https://blog.boot.dev:443/path",
+		},
+		{
+			name:     "remove HTTP scheme with port 80",
+			inputURL: "http://blog.boot.dev:80/path",
+		},
+		{
+			name:     "normalised URL",
+			inputURL: "blog.boot.dev/path",
+		},
+	}
+
+	for ind, tc := range slices.All(cases) {
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
+
+			got, err := normaliseURL(tc.inputURL)
+			if err != nil {
+				t.Fatalf(
+					"Test %v - '%s' FAILED: unexpected error: %v",
+					ind,
+					tc.name,
+					err,
+				)
+			}
+
+			if got != wantNormalisedURL {
+				// A mismatch is a failure, so the message must say FAILED;
+				// it previously read PASSED, which made failing output
+				// misleading.
+				t.Errorf(
+					"Test %d - %s FAILED: unexpected normalised URL returned: want %s, got %s",
+					ind,
+					tc.name,
+					wantNormalisedURL,
+					got,
+				)
+
+				return
+			}
+
+			t.Logf(
+				"Test %d - %s PASSED: expected normalised URL returned: got %s",
+				ind,
+				tc.name,
+				got,
+			)
+		})
+	}
+}
--- a/tests/GetURLFromHTML/ben-bartlett.html
+++ b/tests/GetURLFromHTML/ben-bartlett.html
@ -0,0 +1,34 @@
+<!DOCTYPE html>
+<html xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
+<head>
+  <meta charset="utf-8" />
+  <title>Ben Bartlett</title>
+  <style>
+    code{white-space: pre-wrap;}
+    span.smallcaps{font-variant: small-caps;}
+    span.underline{text-decoration: underline;}
+    div.column{display: inline-block; vertical-align: top; width: 50%;}
+    div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
+    ul.task-list{list-style: none;}
+  </style>
+</head>
+<body>
+<header id="title-block-header">
+<h1 class="title">Ben Bartlett</h1>
+</header>
+<p>Hey there! Ben Bartlett here. I am a Backend software engineer working in the healthcare industry. At night I am a hobbyist developer of 2D games. When I’m not coding I would find myself cooking, reading engaging novels, and going on the occasional hike or two.</p>
+<h2 id="my-links">My Links</h2>
+<ul>
+<li><a href="https://ben-bartlett.me.uk">My website</a></li>
+<li><a href="https://github.com/ben-bartlett">GitHub</a></li>
+<li><a href="https://mastodon.ben-bartlett.me.uk">Mastodon</a></li>
+<li><a href="/blog">My blog</a></li>
+</ul>
+<h2 id="projects-im-working-on">Projects I’m working on</h2>
+<ul>
+<li><a href="/projects/orange-juice">Orange Juice</a></li>
+<li><a href="/projects/mustangs">Mustangs</a></li>
+<li><a href="/projects/honeycombs">Honeycombs</a></li>
+</ul>
+</body>
+</html>
--- a/tests/GetURLFromHTML/blog.boot.dev.html
+++ b/tests/GetURLFromHTML/blog.boot.dev.html
@ -0,0 +1,10 @@
+<html>
+	<body>
+		<a href="/path/one">
+			<span>Boot.dev</span>
+		</a>
+		<a href="https://other.com/path/one">
+			<span>Boot.dev</span>
+		</a>
+	</body>
+</html>
--- a/tests/GetURLFromHTML/my-simple-cooking-website.html
+++ b/tests/GetURLFromHTML/my-simple-cooking-website.html
@ -0,0 +1,37 @@
+<!DOCTYPE html>
+<html xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
+<head>
+  <meta charset="utf-8" />
+  <meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes" />
+  <title>My simple cooking website</title>
+  <style>
+    code{white-space: pre-wrap;}
+    span.smallcaps{font-variant: small-caps;}
+    span.underline{text-decoration: underline;}
+    div.column{display: inline-block; vertical-align: top; width: 50%;}
+    div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
+    ul.task-list{list-style: none;}
+  </style>
+</head>
+<body>
+<header id="title-block-header">
+<h1 class="title">My simple cooking website</h1>
+</header>
+<p>Find my favourite recipes here.</p>
+<h2 id="recipes">Recipes</h2>
+<ul>
+<li><a href="/recipes/sweet-n-sour-kung-pao-style-chicken">Sweet ‘n’ Sour Kung Pao-Style Chicken</a></li>
+<li><a href="/recipes/beef-and-broccoli">Beef and Broccoli</a></li>
+<li><a href="/recipes/asian-glazed-salmon">Asian Glazed Salmon</a></li>
+<li><a href="/recipes/caesar-salad">Caesar Salad</a></li>
+<li><a href="/recipes/simple-tuna-salad">Simple Tuna Salad</a></li>
+<li><a href="/recipes/wholemeal-pizza">Wholemeal Pizza</a></li>
+</ul>
+<h2 id="links">Links</h2>
+<ul>
+<li><a href="/news">News</a></li>
+<li><a href="/about/contact">Contact</a></li>
+<li><a href="https://the-other-site.example.new/home">The other site</a></li>
+</ul>
+</body>
+</html>