From 235132d0cc75af527191d2c19c8f667e60000dfd Mon Sep 17 00:00:00 2001 From: Dan Anglin Date: Mon, 26 Aug 2024 18:37:45 +0100 Subject: [PATCH] wip --- .gitignore | 3 +- .golangci.yaml | 6 +- __build/.gitkeep | 0 get_urls_from_html.go | 70 +++++++++++++ get_urls_from_html_test.go | 97 +++++++++++++++++++ go.mod | 5 + go.sum | 2 + magefiles/go.mod | 5 + magefiles/go.sum | 2 + magefiles/mage.go | 20 ++-- main.go | 17 +++- normalise_url.go | 18 ++++ normalise_url_test.go | 79 +++++++++++++++ tests/GetURLFromHTML/ben-bartlett.html | 34 +++++++ tests/GetURLFromHTML/blog.boot.dev.html | 10 ++ .../my-simple-cooking-website.html | 37 +++++++ 16 files changed, 390 insertions(+), 15 deletions(-) delete mode 100644 __build/.gitkeep create mode 100644 get_urls_from_html.go create mode 100644 get_urls_from_html_test.go create mode 100644 go.mod create mode 100644 go.sum create mode 100644 magefiles/go.mod create mode 100644 magefiles/go.sum create mode 100644 normalise_url.go create mode 100644 normalise_url_test.go create mode 100644 tests/GetURLFromHTML/ben-bartlett.html create mode 100644 tests/GetURLFromHTML/blog.boot.dev.html create mode 100644 tests/GetURLFromHTML/my-simple-cooking-website.html diff --git a/.gitignore b/.gitignore index e200850..8bd4ac1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1 @@ -/__build/* -!__build/.gitkeep +crawler diff --git a/.golangci.yaml b/.golangci.yaml index 8549273..dd8e3ee 100644 --- a/.golangci.yaml +++ b/.golangci.yaml @@ -6,7 +6,7 @@ run: tests: true output: - format: colored-line-number + formats: colored-line-number print-issues-lines: true print-linter-name: true uniq-by-line: true @@ -18,5 +18,7 @@ linters-settings: linters: enable-all: true - # disable: + disable: + - execinquery + - gomnd fast: false diff --git a/__build/.gitkeep b/__build/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/get_urls_from_html.go b/get_urls_from_html.go new file mode 100644 index 0000000..b65ab62 --- /dev/null +++ 
b/get_urls_from_html.go @@ -0,0 +1,70 @@ +package main + +import ( + "fmt" + "net/url" + "strings" + + "golang.org/x/net/html" +) + +func getURLsFromHTML(htmlBody, rawBaseURL string) ([]string, error) { + htmlDoc, err := html.Parse(strings.NewReader(htmlBody)) + if err != nil { + return []string{}, fmt.Errorf("unable to parse the HTML document: %w", err) + } + + parsedRawBaseURL, err := url.Parse(rawBaseURL) + if err != nil { + return []string{}, fmt.Errorf("unable to parse the raw base URL %q: %w", rawBaseURL, err) + } + + output := make([]string, 0, 3) + + var extractLinkFunc func(*html.Node) error + + extractLinkFunc = func(node *html.Node) error { + if node.Type == html.ElementNode && node.Data == "a" { + for _, a := range node.Attr { + if a.Key == "href" { + extractedURL, err := getAbsoluteURL(a.Val, parsedRawBaseURL) + if err != nil { + return fmt.Errorf("unable to get the absolute URL of %s: %w", a.Val, err) + } + + output = append(output, extractedURL) + + break + } + } + } + + for c := node.FirstChild; c != nil; c = c.NextSibling { + if err := extractLinkFunc(c); err != nil { + return err + } + } + + return nil + } + + if err := extractLinkFunc(htmlDoc); err != nil { + return []string{}, err + } + + return output, nil +} + +func getAbsoluteURL(inputURL string, baseURL *url.URL) (string, error) { + parsedURL, err := url.Parse(inputURL) + if err != nil { + return "", fmt.Errorf("unable to parse the URL from %s: %w", inputURL, err) + } + + if parsedURL.Scheme == "" && parsedURL.Host == "" { + parsedURL.Scheme = baseURL.Scheme + parsedURL.Host = baseURL.Host + } + + return parsedURL.String(), nil +} diff --git a/get_urls_from_html_test.go b/get_urls_from_html_test.go new file mode 100644 index 0000000..c6d30d2 --- /dev/null +++ b/get_urls_from_html_test.go @@ -0,0 +1,97 @@ +package main + +import ( + "os" + "reflect" + "slices" + "testing" +) + +func TestGetURLsFromHTML(t *testing.T) { + t.Parallel() + + cases := []struct { + name string + filepath string + 
baseURL string + want []string + }{ + { + name: "HTML documentation using blog.boot.dev", + filepath: "tests/GetURLFromHTML/blog.boot.dev.html", + baseURL: "https://blog.boot.dev", + want: []string{ + "https://blog.boot.dev/path/one", + "https://other.com/path/one", + }, + }, + { + name: "HTML documentation using https://ben-bartlett.me.uk", + filepath: "tests/GetURLFromHTML/ben-bartlett.html", + baseURL: "https://ben-bartlett.me.uk", + want: []string{ + "https://ben-bartlett.me.uk", + "https://github.com/ben-bartlett", + "https://mastodon.ben-bartlett.me.uk", + "https://ben-bartlett.me.uk/blog", + "https://ben-bartlett.me.uk/projects/orange-juice", + "https://ben-bartlett.me.uk/projects/mustangs", + "https://ben-bartlett.me.uk/projects/honeycombs", + }, + }, + { + name: "HTML documentation using https://simple.cooking", + filepath: "tests/GetURLFromHTML/my-simple-cooking-website.html", + baseURL: "https://simple.cooking", + want: []string{ + "https://simple.cooking/recipes/sweet-n-sour-kung-pao-style-chicken", + "https://simple.cooking/recipes/beef-and-broccoli", + "https://simple.cooking/recipes/asian-glazed-salmon", + "https://simple.cooking/recipes/caesar-salad", + "https://simple.cooking/recipes/simple-tuna-salad", + "https://simple.cooking/recipes/wholemeal-pizza", + "https://simple.cooking/news", + "https://simple.cooking/about/contact", + "https://the-other-site.example.new/home", + }, + }, + } + + for _, tc := range slices.All(cases) { + t.Run(tc.name, testGetURLsFromHTML(tc.filepath, tc.baseURL, tc.want)) + } +} + +func testGetURLsFromHTML(path, baseURL string, want []string) func(t *testing.T) { + failedTestPrefix := "Test TestGetURLsFromHTML FAILED:" + + return func(t *testing.T) { + t.Parallel() + + htmlDoc, err := os.ReadFile(path) + if err != nil { + t.Fatalf("%s unable to read data from %s: %v", failedTestPrefix, path, err) + } + + got, err := getURLsFromHTML(string(htmlDoc), baseURL) + if err != nil { + t.Fatalf( + "Test TestGetURLsFromHTML 
FAILED: unexpected error: %v", + err, + ) + } + + if !reflect.DeepEqual(want, got) { + t.Errorf( + "Test TestGetURLsFromHTML FAILED: unexpected URLs found in HTML body: want %v, got %v", + want, + got, + ) + } else { + t.Logf( + "Test TestGetURLsFromHTML PASSED: expected URLs found in HTML body: got %v", + got, + ) + } + } +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..6e9f95f --- /dev/null +++ b/go.mod @@ -0,0 +1,5 @@ +module codeflow.dananglin.me.uk/apollo/web-crawler + +go 1.23.0 + +require golang.org/x/net v0.28.0 diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..e890837 --- /dev/null +++ b/go.sum @@ -0,0 +1,2 @@ +golang.org/x/net v0.28.0 h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE= +golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg= diff --git a/magefiles/go.mod b/magefiles/go.mod new file mode 100644 index 0000000..402c8a6 --- /dev/null +++ b/magefiles/go.mod @@ -0,0 +1,5 @@ +module codeflow.dananglin.me.uk/apollo/web-crawler/magefiles + +go 1.23.0 + +require github.com/magefile/mage v1.15.0 diff --git a/magefiles/go.sum b/magefiles/go.sum new file mode 100644 index 0000000..4ee1b87 --- /dev/null +++ b/magefiles/go.sum @@ -0,0 +1,2 @@ +github.com/magefile/mage v1.15.0 h1:BvGheCMAsG3bWUDbZ8AyXXpCNwU9u5CB6sM+HNb9HYg= +github.com/magefile/mage v1.15.0/go.mod h1:z5UZb/iS3GoOSn0JgWuiw7dxlurVYTu+/jHXqQg881A= diff --git a/magefiles/mage.go b/magefiles/mage.go index 1a12267..008a8d7 100644 --- a/magefiles/mage.go +++ b/magefiles/mage.go @@ -14,23 +14,23 @@ import ( ) const ( - app = "binary" + app = "crawler" defaultInstallPrefix = "/usr/local" - envInstallPrefix = "PROJECT_INSTALL_PREFIX" - envTestVerbose = "PROJECT_TEST_VERBOSE" - envTestCover = "PROJECT_TEST_COVER" - envBuildRebuildAll = "PROJECT_BUILD_REBUILD_ALL" - envBuildVerbose = "PROJECT_BUILD_VERBOSE" + envInstallPrefix = "CRAWLER_INSTALL_PREFIX" + envTestVerbose = "CRAWLER_TEST_VERBOSE" + envTestCover = "CRAWLER_TEST_COVER" + 
envBuildRebuildAll = "CRAWLER_BUILD_REBUILD_ALL" + envBuildVerbose = "CRAWLER_BUILD_VERBOSE" ) var ( Default = Build - binary = "./__build/" + app + binary = app ) // Test run the go tests. -// To enable verbose mode set PROJECT_TEST_VERBOSE=1. -// To enable coverage mode set PROJECT_TEST_COVER=1. +// To enable verbose mode set CRAWLER_TEST_VERBOSE=1. +// To enable coverage mode set CRAWLER_TEST_COVER=1. func Test() error { goTest := sh.RunCmd("go", "test") @@ -56,7 +56,7 @@ func Lint() error { // To rebuild packages that are already up-to-date set PROJECT_BUILD_REBUILD_ALL=1 // To enable verbose mode set PROJECT_BUILD_VERBOSE=1 func Build() error { - main := "main.go" + main := "." flags := ldflags() build := sh.RunCmd("go", "build") args := []string{"-ldflags=" + flags, "-o", binary} diff --git a/main.go b/main.go index a3255e8..86d96a5 100644 --- a/main.go +++ b/main.go @@ -1,6 +1,7 @@ package main import ( + "errors" "fmt" "os" ) @@ -14,11 +15,25 @@ var ( func main() { if err := run(); err != nil { - fmt.Printf("ERROR: %v.\n", err) + fmt.Println(err) os.Exit(1) } } func run() error { + args := os.Args[1:] + + if len(args) == 0 { + return errors.New("no website provided") + } + + if len(args) > 1 { + return errors.New("too many arguments provided") + } + + baseURL := args[0] + + fmt.Printf("starting crawl of: %s\n", baseURL) + return nil } diff --git a/normalise_url.go b/normalise_url.go new file mode 100644 index 0000000..a6ef682 --- /dev/null +++ b/normalise_url.go @@ -0,0 +1,18 @@ +package main + +import ( + "fmt" + "net/url" + "strings" +) + +func normaliseURL(input string) (string, error) { + const normalisedFormat string = "%s%s" + + parsedURL, err := url.Parse(input) + if err != nil { + return "", fmt.Errorf("error parsing the URL %q: %w", input, err) + } + + return fmt.Sprintf(normalisedFormat, parsedURL.Hostname(), strings.TrimSuffix(parsedURL.Path, "/")), nil +} diff --git a/normalise_url_test.go b/normalise_url_test.go new file mode 100644 index 
0000000..4df7b4f --- /dev/null +++ b/normalise_url_test.go @@ -0,0 +1,79 @@ +package main + +import ( + "slices" + "testing" +) + +func TestNormaliseURL(t *testing.T) { + t.Parallel() + + wantNormalisedURL := "blog.boot.dev/path" + + cases := []struct { + name string + inputURL string + }{ + { + name: "remove HTTPS scheme", + inputURL: "https://blog.boot.dev/path", + }, + { + name: "remove HTTP scheme", + inputURL: "http://blog.boot.dev/path", + }, + { + name: "remove HTTPS scheme with a trailing slash", + inputURL: "https://blog.boot.dev/path/", + }, + { + name: "remove HTTP scheme with a trailing slash", + inputURL: "http://blog.boot.dev/path/", + }, + { + name: "remove HTTPS scheme with port 443", + inputURL: "https://blog.boot.dev:443/path", + }, + { + name: "remove HTTP scheme with port 80", + inputURL: "http://blog.boot.dev:80/path", + }, + { + name: "normalised URL", + inputURL: "blog.boot.dev/path", + }, + } + + for ind, tc := range slices.All(cases) { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + got, err := normaliseURL(tc.inputURL) + if err != nil { + t.Fatalf( + "Test %v - '%s' FAILED: unexpected error: %v", + ind, + tc.name, + err, + ) + } + + if got != wantNormalisedURL { + t.Errorf( + "Test %d - %s FAILED: unexpected normalised URL returned: want %s, got %s", + ind, + tc.name, + wantNormalisedURL, + got, + ) + } else { + t.Logf( + "Test %d - %s PASSED: expected normalised URL returned: got %s", + ind, + tc.name, + got, + ) + } + }) + } +} diff --git a/tests/GetURLFromHTML/ben-bartlett.html b/tests/GetURLFromHTML/ben-bartlett.html new file mode 100644 index 0000000..f387d08 --- /dev/null +++ b/tests/GetURLFromHTML/ben-bartlett.html @@ -0,0 +1,34 @@ + + + + + Ben Bartlett + + + +
+

Ben Bartlett

+
+

Hey there! Ben Bartlett here. I am a backend software engineer working in the healthcare industry. At night I am a hobbyist developer of 2D games. When I’m not coding, you can find me cooking, reading engaging novels, and going on the occasional hike or two.

+ + +

Projects I’m working on

+ + + diff --git a/tests/GetURLFromHTML/blog.boot.dev.html b/tests/GetURLFromHTML/blog.boot.dev.html new file mode 100644 index 0000000..853fe1e --- /dev/null +++ b/tests/GetURLFromHTML/blog.boot.dev.html @@ -0,0 +1,10 @@ + + + + Boot.dev + + + Boot.dev + + + diff --git a/tests/GetURLFromHTML/my-simple-cooking-website.html b/tests/GetURLFromHTML/my-simple-cooking-website.html new file mode 100644 index 0000000..28c23b3 --- /dev/null +++ b/tests/GetURLFromHTML/my-simple-cooking-website.html @@ -0,0 +1,37 @@ + + + + + + My simple cooking website + + + +
+

My simple cooking website

+
+

Find my favourite recipes here.

+

Recipes

+ + + + +