checkpoint: a bit of project restructuring

Dan Anglin 2024-08-27 11:47:46 +01:00
parent 1d493e80c7
commit 14dd76d25c
Signed by: dananglin
GPG key ID: 0C1D44CFBEE68638
9 changed files with 37 additions and 218 deletions

@@ -1,4 +1,4 @@
-package main
+package util

 import (
 	"fmt"
@@ -8,7 +8,7 @@ import (
 	"golang.org/x/net/html"
 )

-func getURLsFromHTML(htmlBody, rawBaseURL string) ([]string, error) {
+func GetURLsFromHTML(htmlBody, rawBaseURL string) ([]string, error) {
 	htmlDoc, err := html.Parse(strings.NewReader(htmlBody))
 	if err != nil {
 		return []string{}, fmt.Errorf("unable to parse the HTML document: %w", err)
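
With the move into internal/util, the HTML helper is now exported as util.GetURLsFromHTML. A minimal sketch of a call site under the new layout — the HTML snippet and the main function are illustrative; only the import path and the function signature come from this diff:

package main

import (
	"fmt"
	"log"

	"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
)

func main() {
	// Illustrative input: a tiny HTML document with one relative and one
	// absolute link. The test cases below show relative hrefs being
	// resolved against the base URL.
	htmlBody := `<html><body>
		<a href="/path/one">One</a>
		<a href="https://other.example.com/path/two">Two</a>
	</body></html>`

	links, err := util.GetURLsFromHTML(htmlBody, "https://blog.boot.dev")
	if err != nil {
		log.Fatalf("unable to extract the URLs: %v", err)
	}

	for _, link := range links {
		fmt.Println(link)
	}
}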

View file

@@ -1,10 +1,12 @@
-package main
+package util_test

 import (
 	"os"
 	"reflect"
 	"slices"
 	"testing"
+
+	"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
 )

 func TestGetURLsFromHTML(t *testing.T) {
@@ -18,7 +20,7 @@ func TestGetURLsFromHTML(t *testing.T) {
 	}{
 		{
 			name:     "HTML documentation using blog.boot.dev",
-			filepath: "tests/GetURLFromHTML/blog.boot.dev.html",
+			filepath: "testdata/GetURLFromHTML/blog.boot.dev.html",
 			baseURL:  "https://blog.boot.dev",
 			want: []string{
 				"https://blog.boot.dev/path/one",
@@ -27,7 +29,7 @@ func TestGetURLsFromHTML(t *testing.T) {
 		},
 		{
 			name:     "HTML documentation using https://ben-bartlett.me.uk",
-			filepath: "tests/GetURLFromHTML/ben-bartlett.html",
+			filepath: "testdata/GetURLFromHTML/ben-bartlett.html",
 			baseURL:  "https://ben-bartlett.me.uk",
 			want: []string{
 				"https://ben-bartlett.me.uk",
@@ -41,7 +43,7 @@ func TestGetURLsFromHTML(t *testing.T) {
 		},
 		{
 			name:     "HTML documentation using https://simple.cooking",
-			filepath: "tests/GetURLFromHTML/my-simple-cooking-website.html",
+			filepath: "testdata/GetURLFromHTML/my-simple-cooking-website.html",
 			baseURL:  "https://simple.cooking",
 			want: []string{
 				"https://simple.cooking/recipes/sweet-n-sour-kung-pao-style-chicken",
@@ -73,7 +75,7 @@ func testGetURLsFromHTML(path, baseURL string, want []string) func(t *testing.T)
 		t.Fatalf("%s unable to open read data from %s: %v", failedTestPrefix, path, err)
 	}

-	got, err := getURLsFromHTML(string(htmlDoc), baseURL)
+	got, err := util.GetURLsFromHTML(string(htmlDoc), baseURL)
 	if err != nil {
 		t.Fatalf(
 			"Test TestGetURLsFromHTML FAILED: unexpected error: %v",

internal/util/url.go (new file, 18 lines)
View file

@@ -0,0 +1,18 @@
+package util
+
+import (
+	"fmt"
+	"net/url"
+	"strings"
+)
+
+func NormaliseURL(rawURL string) (string, error) {
+	const normalisedFormat string = "%s%s"
+
+	parsedURL, err := url.Parse(rawURL)
+	if err != nil {
+		return "", fmt.Errorf("error parsing the URL %q: %w", rawURL, err)
+	}
+
+	return fmt.Sprintf(normalisedFormat, parsedURL.Hostname(), strings.TrimSuffix(parsedURL.Path, "/")), nil
+}
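
NormaliseURL reduces a URL to hostname plus path, trimming any trailing slash, so variants of the same page collapse to a single map key. A quick, hypothetical demonstration of the behaviour implied by the code above:

package main

import (
	"fmt"
	"log"

	"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
)

func main() {
	// Both inputs normalise to "blog.boot.dev/path": the scheme is
	// discarded and the trailing slash is trimmed.
	for _, raw := range []string{
		"https://blog.boot.dev/path/",
		"http://blog.boot.dev/path",
	} {
		normalised, err := util.NormaliseURL(raw)
		if err != nil {
			log.Fatalf("unable to normalise %q: %v", raw, err)
		}

		fmt.Println(normalised)
	}
}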

View file

@@ -1,8 +1,10 @@
-package main
+package util_test

 import (
 	"slices"
 	"testing"
+
+	"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
 )

 func TestNormaliseURL(t *testing.T) {
@@ -48,7 +50,7 @@ func TestNormaliseURL(t *testing.T) {
 		t.Run(tc.name, func(t *testing.T) {
 			t.Parallel()

-			got, err := normaliseURL(tc.inputURL)
+			got, err := util.NormaliseURL(tc.inputURL)
 			if err != nil {
 				t.Fatalf(
 					"Test %d - '%s' FAILED: unexpected error: %v",
@@ -77,70 +79,3 @@ func TestNormaliseURL(t *testing.T) {
 		})
 	}
 }
-
-func TestEqualDomains(t *testing.T) {
-	t.Parallel()
-
-	cases := []struct {
-		name string
-		urlA string
-		urlB string
-		want bool
-	}{
-		{
-			name: "Same domain, different paths",
-			urlA: "https://example.com/news",
-			urlB: "https://example.com/about/contact",
-			want: true,
-		},
-		{
-			name: "Different domains, same path",
-			urlA: "http://example.com/blog",
-			urlB: "http://example.org/blog",
-			want: false,
-		},
-		{
-			name: "Same domain, different protocols",
-			urlA: "http://code.person.me.uk/projects/orion",
-			urlB: "https://code.person.me.uk/user/person/README.md",
-			want: true,
-		},
-	}
-
-	for ind, tc := range slices.All(cases) {
-		t.Run(tc.name, testEqualDomains(ind+1, tc.name, tc.urlA, tc.urlB, tc.want))
-	}
-}
-
-func testEqualDomains(testNum int, testName, urlA, urlB string, want bool) func(t *testing.T) {
-	return func(t *testing.T) {
-		t.Parallel()
-
-		got, err := equalDomains(urlA, urlB)
-		if err != nil {
-			t.Fatalf(
-				"Test %d - '%s' FAILED: unexpected error: %v",
-				testNum,
-				testName,
-				err,
-			)
-		}
-
-		if got != want {
-			t.Errorf(
-				"Test %d - '%s' FAILED: unexpected domain comparison received: want %t, got %t",
-				testNum,
-				testName,
-				want,
-				got,
-			)
-		} else {
-			t.Logf(
-				"Test %d - '%s' PASSED: expected domain comparison received: got %t",
-				testNum,
-				testName,
-				got,
-			)
-		}
-	}
-}

main.go (116 changed lines)
View file

@@ -1,15 +1,10 @@
 package main

 import (
-	"context"
 	"errors"
 	"fmt"
-	"io"
 	"maps"
-	"net/http"
 	"os"
-	"strings"
-	"time"
 )

 var (
@@ -36,16 +31,16 @@ func run() error {
 		return errTooManyArgs
 	}

-	baseURL := args[0]
+	//baseURL := args[0]

 	pages := make(map[string]int)

-	var err error
+	//var err error

-	pages, err = crawlPage(baseURL, baseURL, pages)
-	if err != nil {
-		return fmt.Errorf("received an error while crawling the website: %w", err)
-	}
+	//pages, err = crawlPage(baseURL, baseURL, pages)
+	//if err != nil {
+	//	return fmt.Errorf("received an error while crawling the website: %w", err)
+	//}

 	fmt.Printf("\n\nRESULTS:\n")
@@ -55,102 +50,3 @@ func run() error {

 	return nil
 }
-
-func crawlPage(rawBaseURL, rawCurrentURL string, pages map[string]int) (map[string]int, error) {
-	var err error
-
-	// if current URL is not on the same domain as the base URL, return the current pages.
-	sameDomain, err := equalDomains(rawBaseURL, rawCurrentURL)
-	if err != nil {
-		return pages, err
-	}
-
-	if !sameDomain {
-		return pages, nil
-	}
-
-	// get normalised version of rawCurrentURL
-	normalisedCurrentURL, err := normaliseURL(rawCurrentURL)
-	if err != nil {
-		return pages, err
-	}
-
-	// check if normalised URL has an entry in pages.
-	_, exists := pages[normalisedCurrentURL]
-
-	// If it has an entry, increment the count by 1 and return the pages.
-	if exists {
-		pages[normalisedCurrentURL]++
-
-		return pages, nil
-	}
-
-	// Create an entry for the page
-	pages[normalisedCurrentURL] = 1
-
-	// Get the HTML from the current URL, print that you are getting the HTML doc from current URL.
-	fmt.Printf("Crawling %q\n", rawCurrentURL)
-
-	htmlDoc, err := getHTML(rawCurrentURL)
-	if err != nil {
-		return pages, fmt.Errorf("error retrieving the HTML document from %q: %w", rawCurrentURL, err)
-	}
-
-	// Get all the URLs from the HTML doc.
-	links, err := getURLsFromHTML(htmlDoc, rawBaseURL)
-	if err != nil {
-		return pages, fmt.Errorf("error retrieving the links from the HTML document: %w", err)
-	}
-
-	// Recursively crawl each URL on the page. (add a timeout?)
-	for ind := range len(links) {
-		time.Sleep(time.Duration(1 * time.Second))
-
-		pages, err = crawlPage(rawBaseURL, links[ind], pages)
-		if err != nil {
-			fmt.Println("WARNING: error received while crawling %q: %v", links[ind], err)
-		}
-	}
-
-	return pages, nil
-}
-
-func getHTML(rawURL string) (string, error) {
-	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(10*time.Second))
-	defer cancel()
-
-	request, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
-	if err != nil {
-		return "", fmt.Errorf("error creating the HTTP request: %w", err)
-	}
-
-	client := http.Client{}
-
-	resp, err := client.Do(request)
-	if err != nil {
-		return "", fmt.Errorf("error getting the response: %w", err)
-	}
-	defer resp.Body.Close()
-
-	if resp.StatusCode >= 400 {
-		return "", fmt.Errorf(
-			"received a bad status from %s: (%d) %s",
-			rawURL,
-			resp.StatusCode,
-			resp.Status,
-		)
-	}
-
-	contentType := resp.Header.Get("content-type")
-	if !strings.Contains(contentType, "text/html") {
-		return "", fmt.Errorf("unexpected content type received: want text/html, got %s", contentType)
-	}
-
-	data, err := io.ReadAll(resp.Body)
-	if err != nil {
-		return "", fmt.Errorf("error reading the data from the response: %w", err)
-	}
-
-	return string(data), nil
-}

url.go (deleted, 32 lines)
View file

@@ -1,32 +0,0 @@
-package main
-
-import (
-	"fmt"
-	"net/url"
-	"strings"
-)
-
-func normaliseURL(rawURL string) (string, error) {
-	const normalisedFormat string = "%s%s"
-
-	parsedURL, err := url.Parse(rawURL)
-	if err != nil {
-		return "", fmt.Errorf("error parsing the URL %q: %w", rawURL, err)
-	}
-
-	return fmt.Sprintf(normalisedFormat, parsedURL.Hostname(), strings.TrimSuffix(parsedURL.Path, "/")), nil
-}
-
-func equalDomains(urlA, urlB string) (bool, error) {
-	parsedURLA, err := url.Parse(urlA)
-	if err != nil {
-		return false, fmt.Errorf("error parsing the URL %q: %w", urlA, err)
-	}
-
-	parsedURLB, err := url.Parse(urlB)
-	if err != nil {
-		return false, fmt.Errorf("error parsing the URL %q: %w", urlB, err)
-	}
-
-	return parsedURLA.Hostname() == parsedURLB.Hostname(), nil
-}
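
For reference, the equalDomains helper removed here (together with its tests above) treated two URLs as equal when their hostnames matched, regardless of scheme or path. A self-contained sketch of that removed behaviour; the function body mirrors the deleted code, while main is illustrative:

package main

import (
	"fmt"
	"log"
	"net/url"
)

// equalDomains mirrors the helper deleted in this commit: two URLs are
// considered equal when their hostnames match, whatever their scheme or path.
func equalDomains(urlA, urlB string) (bool, error) {
	parsedURLA, err := url.Parse(urlA)
	if err != nil {
		return false, fmt.Errorf("error parsing the URL %q: %w", urlA, err)
	}

	parsedURLB, err := url.Parse(urlB)
	if err != nil {
		return false, fmt.Errorf("error parsing the URL %q: %w", urlB, err)
	}

	return parsedURLA.Hostname() == parsedURLB.Hostname(), nil
}

func main() {
	same, err := equalDomains(
		"http://code.person.me.uk/projects/orion",
		"https://code.person.me.uk/user/person/README.md",
	)
	if err != nil {
		log.Fatal(err)
	}

	fmt.Println(same) // true: same hostname despite different scheme and path
}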