generated from templates/go-generic
wip
This commit is contained in:
parent
5d447923b1
commit
235132d0cc
16 changed files with 390 additions and 15 deletions
3
.gitignore
vendored
3
.gitignore
vendored
|
@ -1,2 +1 @@
|
||||||
/__build/*
|
crawler
|
||||||
!__build/.gitkeep
|
|
||||||
|
|
|
@ -6,7 +6,7 @@ run:
|
||||||
tests: true
|
tests: true
|
||||||
|
|
||||||
output:
|
output:
|
||||||
format: colored-line-number
|
formats: colored-line-number
|
||||||
print-issues-lines: true
|
print-issues-lines: true
|
||||||
print-linter-name: true
|
print-linter-name: true
|
||||||
uniq-by-line: true
|
uniq-by-line: true
|
||||||
|
@ -18,5 +18,7 @@ linters-settings:
|
||||||
|
|
||||||
linters:
|
linters:
|
||||||
enable-all: true
|
enable-all: true
|
||||||
# disable:
|
disable:
|
||||||
|
- execinquery
|
||||||
|
- gomnd
|
||||||
fast: false
|
fast: false
|
||||||
|
|
70
get_urls_from_html.go
Normal file
70
get_urls_from_html.go
Normal file
|
@ -0,0 +1,70 @@
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"net/url"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"golang.org/x/net/html"
|
||||||
|
)
|
||||||
|
|
||||||
|
func getURLsFromHTML(htmlBody, rawBaseURL string) ([]string, error) {
|
||||||
|
htmlDoc, err := html.Parse(strings.NewReader(htmlBody))
|
||||||
|
if err != nil {
|
||||||
|
return []string{}, fmt.Errorf("unable to parse the HTML document: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
parsedRawBaseURL, err := url.Parse(rawBaseURL)
|
||||||
|
if err != nil {
|
||||||
|
return []string{}, fmt.Errorf("unable to parse the raw base URL %q: %w", rawBaseURL, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
output := make([]string, 0, 3)
|
||||||
|
|
||||||
|
var extractLinkFunc func(*html.Node) error
|
||||||
|
|
||||||
|
extractLinkFunc = func(node *html.Node) error {
|
||||||
|
if node.Type == html.ElementNode && node.Data == "a" {
|
||||||
|
for _, a := range node.Attr {
|
||||||
|
if a.Key == "href" {
|
||||||
|
extractedURL, err := getAbsoluteURL(a.Val, parsedRawBaseURL)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("unable to get the absolute URL of %s: %w", a.Val, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
output = append(output, extractedURL)
|
||||||
|
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for c := node.FirstChild; c != nil; c = c.NextSibling {
|
||||||
|
if err := extractLinkFunc(c); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := extractLinkFunc(htmlDoc); err != nil {
|
||||||
|
return []string{}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return output, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// getAbsoluteURL resolves inputURL against baseURL and returns the resulting
// URL as a string.
//
// Resolution is delegated to (*url.URL).ResolveReference, which follows
// RFC 3986: an absolute inputURL is returned as is, a protocol-relative
// reference ("//host/path") inherits the base scheme, a rooted path ("/path")
// inherits the base scheme and host, and a path-relative reference
// ("page.html", "../up") is resolved against the directory of the base path.
//
// An error is returned only when inputURL cannot be parsed.
func getAbsoluteURL(inputURL string, baseURL *url.URL) (string, error) {
	parsedURL, err := url.Parse(inputURL)
	if err != nil {
		return "", fmt.Errorf("unable to parse the URL from %s: %w", inputURL, err)
	}

	return baseURL.ResolveReference(parsedURL).String(), nil
}
|
97
get_urls_from_html_test.go
Normal file
97
get_urls_from_html_test.go
Normal file
|
@ -0,0 +1,97 @@
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"reflect"
|
||||||
|
"slices"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestGetURLsFromHTML(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
cases := []struct {
|
||||||
|
name string
|
||||||
|
filepath string
|
||||||
|
baseURL string
|
||||||
|
want []string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "HTML documentation using blog.boot.dev",
|
||||||
|
filepath: "tests/GetURLFromHTML/blog.boot.dev.html",
|
||||||
|
baseURL: "https://blog.boot.dev",
|
||||||
|
want: []string{
|
||||||
|
"https://blog.boot.dev/path/one",
|
||||||
|
"https://other.com/path/one",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "HTML documentation using https://ben-bartlett.me.uk",
|
||||||
|
filepath: "tests/GetURLFromHTML/ben-bartlett.html",
|
||||||
|
baseURL: "https://ben-bartlett.me.uk",
|
||||||
|
want: []string{
|
||||||
|
"https://ben-bartlett.me.uk",
|
||||||
|
"https://github.com/ben-bartlett",
|
||||||
|
"https://mastodon.ben-bartlett.me.uk",
|
||||||
|
"https://ben-bartlett.me.uk/blog",
|
||||||
|
"https://ben-bartlett.me.uk/projects/orange-juice",
|
||||||
|
"https://ben-bartlett.me.uk/projects/mustangs",
|
||||||
|
"https://ben-bartlett.me.uk/projects/honeycombs",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "HTML documentation using https://simple.cooking",
|
||||||
|
filepath: "tests/GetURLFromHTML/my-simple-cooking-website.html",
|
||||||
|
baseURL: "https://simple.cooking",
|
||||||
|
want: []string{
|
||||||
|
"https://simple.cooking/recipes/sweet-n-sour-kung-pao-style-chicken",
|
||||||
|
"https://simple.cooking/recipes/beef-and-broccoli",
|
||||||
|
"https://simple.cooking/recipes/asian-glazed-salmon",
|
||||||
|
"https://simple.cooking/recipes/caesar-salad",
|
||||||
|
"https://simple.cooking/recipes/simple-tuna-salad",
|
||||||
|
"https://simple.cooking/recipes/wholemeal-pizza",
|
||||||
|
"https://simple.cooking/news",
|
||||||
|
"https://simple.cooking/about/contact",
|
||||||
|
"https://the-other-site.example.new/home",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range slices.All(cases) {
|
||||||
|
t.Run(tc.name, testGetURLsFromHTML(tc.filepath, tc.baseURL, tc.want))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func testGetURLsFromHTML(path, baseURL string, want []string) func(t *testing.T) {
|
||||||
|
failedTestPrefix := "Test TestGetURLsFromHTML FAILED:"
|
||||||
|
|
||||||
|
return func(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
htmlDoc, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("%s unable to open read data from %s: %v", failedTestPrefix, path, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
got, err := getURLsFromHTML(string(htmlDoc), baseURL)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf(
|
||||||
|
"Test TestGetURLsFromHTML FAILED: unexpected error: %v",
|
||||||
|
err,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !reflect.DeepEqual(want, got) {
|
||||||
|
t.Errorf(
|
||||||
|
"Test TestGetURLsFromHTML FAILED: unexpected URLs found in HTML body: want %v, got %v",
|
||||||
|
want,
|
||||||
|
got,
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
t.Logf(
|
||||||
|
"Test TestGetURLsFromHTML PASSED: expected URLs found in HTML body: got %v",
|
||||||
|
got,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
5
go.mod
Normal file
5
go.mod
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
module codeflow.dananglin.me.uk/apollo/web-crawler
|
||||||
|
|
||||||
|
go 1.23.0
|
||||||
|
|
||||||
|
require golang.org/x/net v0.28.0
|
2
go.sum
Normal file
2
go.sum
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
golang.org/x/net v0.28.0 h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE=
|
||||||
|
golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg=
|
5
magefiles/go.mod
Normal file
5
magefiles/go.mod
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
module codeflow.dananglin.me.uk/apollo/web-crawler/magefiles
|
||||||
|
|
||||||
|
go 1.23.0
|
||||||
|
|
||||||
|
require github.com/magefile/mage v1.15.0
|
2
magefiles/go.sum
Normal file
2
magefiles/go.sum
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
github.com/magefile/mage v1.15.0 h1:BvGheCMAsG3bWUDbZ8AyXXpCNwU9u5CB6sM+HNb9HYg=
|
||||||
|
github.com/magefile/mage v1.15.0/go.mod h1:z5UZb/iS3GoOSn0JgWuiw7dxlurVYTu+/jHXqQg881A=
|
|
@ -14,23 +14,23 @@ import (
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
app = "binary"
|
app = "crawler"
|
||||||
defaultInstallPrefix = "/usr/local"
|
defaultInstallPrefix = "/usr/local"
|
||||||
envInstallPrefix = "PROJECT_INSTALL_PREFIX"
|
envInstallPrefix = "CRAWLER_INSTALL_PREFIX"
|
||||||
envTestVerbose = "PROJECT_TEST_VERBOSE"
|
envTestVerbose = "CRAWLER_TEST_VERBOSE"
|
||||||
envTestCover = "PROJECT_TEST_COVER"
|
envTestCover = "CRAWLER_TEST_COVER"
|
||||||
envBuildRebuildAll = "PROJECT_BUILD_REBUILD_ALL"
|
envBuildRebuildAll = "CRAWLER_BUILD_REBUILD_ALL"
|
||||||
envBuildVerbose = "PROJECT_BUILD_VERBOSE"
|
envBuildVerbose = "CRAWLER_BUILD_VERBOSE"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
Default = Build
|
Default = Build
|
||||||
binary = "./__build/" + app
|
binary = app
|
||||||
)
|
)
|
||||||
|
|
||||||
// Test run the go tests.
|
// Test run the go tests.
|
||||||
// To enable verbose mode set PROJECT_TEST_VERBOSE=1.
|
// To enable verbose mode set CRAWLER_TEST_VERBOSE=1.
|
||||||
// To enable coverage mode set PROJECT_TEST_COVER=1.
|
// To enable coverage mode set CRAWLER_TEST_COVER=1.
|
||||||
func Test() error {
|
func Test() error {
|
||||||
goTest := sh.RunCmd("go", "test")
|
goTest := sh.RunCmd("go", "test")
|
||||||
|
|
||||||
|
@ -56,7 +56,7 @@ func Lint() error {
|
||||||
// To rebuild packages that are already up-to-date set PROJECT_BUILD_REBUILD_ALL=1
|
// To rebuild packages that are already up-to-date set CRAWLER_BUILD_REBUILD_ALL=1
|
||||||
// To enable verbose mode set PROJECT_BUILD_VERBOSE=1
|
// To enable verbose mode set CRAWLER_BUILD_VERBOSE=1
|
||||||
func Build() error {
|
func Build() error {
|
||||||
main := "main.go"
|
main := "."
|
||||||
flags := ldflags()
|
flags := ldflags()
|
||||||
build := sh.RunCmd("go", "build")
|
build := sh.RunCmd("go", "build")
|
||||||
args := []string{"-ldflags=" + flags, "-o", binary}
|
args := []string{"-ldflags=" + flags, "-o", binary}
|
||||||
|
|
17
main.go
17
main.go
|
@ -1,6 +1,7 @@
|
||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
)
|
)
|
||||||
|
@ -14,11 +15,25 @@ var (
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
if err := run(); err != nil {
|
if err := run(); err != nil {
|
||||||
fmt.Printf("ERROR: %v.\n", err)
|
fmt.Println(err)
|
||||||
os.Exit(1)
|
os.Exit(1)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func run() error {
|
func run() error {
|
||||||
|
args := os.Args[1:]
|
||||||
|
|
||||||
|
if len(args) == 0 {
|
||||||
|
return errors.New("no website provided")
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(args) > 1 {
|
||||||
|
return errors.New("too many arguments provided")
|
||||||
|
}
|
||||||
|
|
||||||
|
baseURL := args[0]
|
||||||
|
|
||||||
|
fmt.Printf("starting crawl of: %s\n", baseURL)
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
18
normalise_url.go
Normal file
18
normalise_url.go
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"net/url"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// normaliseURL reduces input to a scheme-less "host/path" form so that
// different spellings of the same address compare equal.
//
// The scheme, user info, query and fragment are dropped, a single trailing
// slash is removed from the path, and the host is lower-cased because host
// names are case-insensitive. A port is dropped when it is the default for
// the scheme (80 for http, 443 for https); any other explicit port is kept so
// that distinct services on one host do not collapse into the same key.
//
// An input without a scheme (for example "blog.boot.dev/path") is parsed by
// url.Parse as a bare path and is returned with only the trailing slash
// removed.
//
// An error is returned when input cannot be parsed as a URL.
func normaliseURL(input string) (string, error) {
	parsedURL, err := url.Parse(input)
	if err != nil {
		return "", fmt.Errorf("error parsing the URL %q: %w", input, err)
	}

	port := parsedURL.Port()
	defaultPort := port == "" ||
		(parsedURL.Scheme == "http" && port == "80") ||
		(parsedURL.Scheme == "https" && port == "443")

	host := parsedURL.Hostname()
	if !defaultPort {
		// Host still carries the ":port" suffix (and IPv6 brackets).
		host = parsedURL.Host
	}

	return strings.ToLower(host) + strings.TrimSuffix(parsedURL.Path, "/"), nil
}
|
79
normalise_url_test.go
Normal file
79
normalise_url_test.go
Normal file
|
@ -0,0 +1,79 @@
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"slices"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestNormaliseURL(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
wantNormalisedURL := "blog.boot.dev/path"
|
||||||
|
|
||||||
|
cases := []struct {
|
||||||
|
name string
|
||||||
|
inputURL string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "remove HTTPS scheme",
|
||||||
|
inputURL: "https://blog.boot.dev/path",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "remove HTTP scheme",
|
||||||
|
inputURL: "http://blog.boot.dev/path",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "remove HTTPS scheme with a trailing slash",
|
||||||
|
inputURL: "https://blog.boot.dev/path/",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "remove HTTP scheme with a trailing slash",
|
||||||
|
inputURL: "http://blog.boot.dev/path/",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "remove HTTPS scheme with port 443",
|
||||||
|
inputURL: "https://blog.boot.dev:443/path",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "remove HTTP scheme with port 80",
|
||||||
|
inputURL: "http://blog.boot.dev:80/path",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "normalised URL",
|
||||||
|
inputURL: "blog.boot.dev/path",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for ind, tc := range slices.All(cases) {
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
got, err := normaliseURL(tc.inputURL)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf(
|
||||||
|
"Test %v - '%s' FAILED: unexpected error: %v",
|
||||||
|
ind,
|
||||||
|
tc.name,
|
||||||
|
err,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
if got != wantNormalisedURL {
|
||||||
|
t.Errorf(
|
||||||
|
"Test %d - %s PASSED: unexpected normalised URL returned: want %s, got %s",
|
||||||
|
ind,
|
||||||
|
tc.name,
|
||||||
|
wantNormalisedURL,
|
||||||
|
got,
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
t.Logf(
|
||||||
|
"Test %d - %s PASSED: expected normalised URL returned: got %s",
|
||||||
|
ind,
|
||||||
|
tc.name,
|
||||||
|
got,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
34
tests/GetURLFromHTML/ben-bartlett.html
Normal file
34
tests/GetURLFromHTML/ben-bartlett.html
Normal file
|
@ -0,0 +1,34 @@
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8" />
|
||||||
|
<title>Ben Bartlett</title>
|
||||||
|
<style>
|
||||||
|
code{white-space: pre-wrap;}
|
||||||
|
span.smallcaps{font-variant: small-caps;}
|
||||||
|
span.underline{text-decoration: underline;}
|
||||||
|
div.column{display: inline-block; vertical-align: top; width: 50%;}
|
||||||
|
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
|
||||||
|
ul.task-list{list-style: none;}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<header id="title-block-header">
|
||||||
|
<h1 class="title">Ben Bartlett</h1>
|
||||||
|
</header>
|
||||||
|
<p>Hey there! Ben Bartlett here. I am a Backend software engineer working in the healthcare industry. At night I am a hobbyist developer of 2D games. When I’m not coding I would find myself cooking, reading engaging novels, and going on the occasional hike or two.</p>
|
||||||
|
<h2 id="my-links">My Links</h2>
|
||||||
|
<ul>
|
||||||
|
<li><a href="https://ben-bartlett.me.uk">My website</a></li>
|
||||||
|
<li><a href="https://github.com/ben-bartlett">GitHub</a></li>
|
||||||
|
<li><a href="https://mastodon.ben-bartlett.me.uk">Mastodon</a></li>
|
||||||
|
<li><a href="/blog">My blog</a></li>
|
||||||
|
</ul>
|
||||||
|
<h2 id="projects-im-working-on">Projects I’m working on</h2>
|
||||||
|
<ul>
|
||||||
|
<li><a href="/projects/orange-juice">Orange Juice</a></li>
|
||||||
|
<li><a href="/projects/mustangs">Mustangs</a></li>
|
||||||
|
<li><a href="/projects/honeycombs">Honeycombs</a></li>
|
||||||
|
</ul>
|
||||||
|
</body>
|
||||||
|
</html>
|
10
tests/GetURLFromHTML/blog.boot.dev.html
Normal file
10
tests/GetURLFromHTML/blog.boot.dev.html
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<a href="/path/one">
|
||||||
|
<span>Boot.dev</span>
|
||||||
|
</a>
|
||||||
|
<a href="https://other.com/path/one">
|
||||||
|
<span>Boot.dev</span>
|
||||||
|
</a>
|
||||||
|
</body>
|
||||||
|
</html>
|
37
tests/GetURLFromHTML/my-simple-cooking-website.html
Normal file
37
tests/GetURLFromHTML/my-simple-cooking-website.html
Normal file
|
@ -0,0 +1,37 @@
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8" />
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes" />
|
||||||
|
<title>My simple cooking website</title>
|
||||||
|
<style>
|
||||||
|
code{white-space: pre-wrap;}
|
||||||
|
span.smallcaps{font-variant: small-caps;}
|
||||||
|
span.underline{text-decoration: underline;}
|
||||||
|
div.column{display: inline-block; vertical-align: top; width: 50%;}
|
||||||
|
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
|
||||||
|
ul.task-list{list-style: none;}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<header id="title-block-header">
|
||||||
|
<h1 class="title">My simple cooking website</h1>
|
||||||
|
</header>
|
||||||
|
<p>Find my favourite recipes here.</p>
|
||||||
|
<h2 id="recipes">Recipes</h2>
|
||||||
|
<ul>
|
||||||
|
<li><a href="/recipes/sweet-n-sour-kung-pao-style-chicken">Sweet ‘n’ Sour Kung Pao-Style Chicken</a></li>
|
||||||
|
<li><a href="/recipes/beef-and-broccoli">Beef and Broccoli</a></li>
|
||||||
|
<li><a href="/recipes/asian-glazed-salmon">Asian Glazed Salmon</a></li>
|
||||||
|
<li><a href="/recipes/caesar-salad">Caesar Salad</a></li>
|
||||||
|
<li><a href="/recipes/simple-tuna-salad">Simple Tuna Salad</a></li>
|
||||||
|
<li><a href="/recipes/wholemeal-pizza">Wholemeal Pizza</a></li>
|
||||||
|
</ul>
|
||||||
|
<h2 id="links">Links</h2>
|
||||||
|
<ul>
|
||||||
|
<li><a href="/news">News</a></li>
|
||||||
|
<li><a href="/about/contact">Contact</a></li>
|
||||||
|
<li><a href="https://the-other-site.example.new/home">The other site</a></li>
|
||||||
|
</ul>
|
||||||
|
</body>
|
||||||
|
</html>
|
Loading…
Reference in a new issue