From 14dd76d25cca8bbd11dc55e468ea4e1c132e6a4d Mon Sep 17 00:00:00 2001
From: Dan Anglin
Date: Tue, 27 Aug 2024 11:47:46 +0100
Subject: [PATCH] checkpoint: a bit of project restructuring

---
 html.go => internal/util/html.go           |   4 +-
 html_test.go => internal/util/html_test.go |  12 +-
 .../GetURLFromHTML/ben-bartlett.html       |   0
 .../GetURLFromHTML/blog.boot.dev.html      |   0
 .../my-simple-cooking-website.html         |   0
 internal/util/url.go                       |  18 +++
 url_test.go => internal/util/url_test.go   |  73 +----------
 main.go                                    | 116 +-----------------
 url.go                                     |  32 -----
 9 files changed, 37 insertions(+), 218 deletions(-)
 rename html.go => internal/util/html.go (95%)
 rename html_test.go => internal/util/html_test.go (87%)
 rename {tests => internal/util/testdata}/GetURLFromHTML/ben-bartlett.html (100%)
 rename {tests => internal/util/testdata}/GetURLFromHTML/blog.boot.dev.html (100%)
 rename {tests => internal/util/testdata}/GetURLFromHTML/my-simple-cooking-website.html (100%)
 create mode 100644 internal/util/url.go
 rename url_test.go => internal/util/url_test.go (50%)
 delete mode 100644 url.go

diff --git a/html.go b/internal/util/html.go
similarity index 95%
rename from html.go
rename to internal/util/html.go
index b65ab62..52716e3 100644
--- a/html.go
+++ b/internal/util/html.go
@@ -1,4 +1,4 @@
-package main
+package util
 
 import (
 	"fmt"
@@ -8,7 +8,7 @@ import (
 	"golang.org/x/net/html"
 )
 
-func getURLsFromHTML(htmlBody, rawBaseURL string) ([]string, error) {
+func GetURLsFromHTML(htmlBody, rawBaseURL string) ([]string, error) {
 	htmlDoc, err := html.Parse(strings.NewReader(htmlBody))
 	if err != nil {
 		return []string{}, fmt.Errorf("unable to parse the HTML document: %w", err)
diff --git a/html_test.go b/internal/util/html_test.go
similarity index 87%
rename from html_test.go
rename to internal/util/html_test.go
index c6d30d2..479a966 100644
--- a/html_test.go
+++ b/internal/util/html_test.go
@@ -1,10 +1,12 @@
-package main
+package util_test
 
 import (
 	"os"
 	"reflect"
 	"slices"
 	"testing"
+
+	"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
 )
 
 func TestGetURLsFromHTML(t *testing.T) {
@@ -18,7 +20,7 @@ func TestGetURLsFromHTML(t *testing.T) {
 	}{
 		{
 			name:     "HTML documentation using blog.boot.dev",
-			filepath: "tests/GetURLFromHTML/blog.boot.dev.html",
+			filepath: "testdata/GetURLFromHTML/blog.boot.dev.html",
 			baseURL:  "https://blog.boot.dev",
 			want: []string{
 				"https://blog.boot.dev/path/one",
@@ -27,7 +29,7 @@ func TestGetURLsFromHTML(t *testing.T) {
 		},
 		{
 			name:     "HTML documentation using https://ben-bartlett.me.uk",
-			filepath: "tests/GetURLFromHTML/ben-bartlett.html",
+			filepath: "testdata/GetURLFromHTML/ben-bartlett.html",
 			baseURL:  "https://ben-bartlett.me.uk",
 			want: []string{
 				"https://ben-bartlett.me.uk",
@@ -41,7 +43,7 @@ func TestGetURLsFromHTML(t *testing.T) {
 		},
 		{
 			name:     "HTML documentation using https://simple.cooking",
-			filepath: "tests/GetURLFromHTML/my-simple-cooking-website.html",
+			filepath: "testdata/GetURLFromHTML/my-simple-cooking-website.html",
 			baseURL:  "https://simple.cooking",
 			want: []string{
 				"https://simple.cooking/recipes/sweet-n-sour-kung-pao-style-chicken",
@@ -73,7 +75,7 @@ func testGetURLsFromHTML(path, baseURL string, want []string) func(t *testing.T) {
 			t.Fatalf("%s unable to open read data from %s: %v", failedTestPrefix, path, err)
 		}
 
-		got, err := getURLsFromHTML(string(htmlDoc), baseURL)
+		got, err := util.GetURLsFromHTML(string(htmlDoc), baseURL)
 		if err != nil {
 			t.Fatalf(
 				"Test TestGetURLsFromHTML FAILED: unexpected error: %v",
diff --git a/tests/GetURLFromHTML/ben-bartlett.html b/internal/util/testdata/GetURLFromHTML/ben-bartlett.html
similarity index 100%
rename from tests/GetURLFromHTML/ben-bartlett.html
rename to internal/util/testdata/GetURLFromHTML/ben-bartlett.html
diff --git a/tests/GetURLFromHTML/blog.boot.dev.html b/internal/util/testdata/GetURLFromHTML/blog.boot.dev.html
similarity index 100%
rename from tests/GetURLFromHTML/blog.boot.dev.html
rename to internal/util/testdata/GetURLFromHTML/blog.boot.dev.html
diff --git a/tests/GetURLFromHTML/my-simple-cooking-website.html b/internal/util/testdata/GetURLFromHTML/my-simple-cooking-website.html
similarity index 100%
rename from tests/GetURLFromHTML/my-simple-cooking-website.html
rename to internal/util/testdata/GetURLFromHTML/my-simple-cooking-website.html
diff --git a/internal/util/url.go b/internal/util/url.go
new file mode 100644
index 0000000..5e270c5
--- /dev/null
+++ b/internal/util/url.go
@@ -0,0 +1,18 @@
+package util
+
+import (
+	"fmt"
+	"net/url"
+	"strings"
+)
+
+func NormaliseURL(rawURL string) (string, error) {
+	const normalisedFormat string = "%s%s"
+
+	parsedURL, err := url.Parse(rawURL)
+	if err != nil {
+		return "", fmt.Errorf("error parsing the URL %q: %w", rawURL, err)
+	}
+
+	return fmt.Sprintf(normalisedFormat, parsedURL.Hostname(), strings.TrimSuffix(parsedURL.Path, "/")), nil
+}
diff --git a/url_test.go b/internal/util/url_test.go
similarity index 50%
rename from url_test.go
rename to internal/util/url_test.go
index e36d64a..99f591b 100644
--- a/url_test.go
+++ b/internal/util/url_test.go
@@ -1,8 +1,10 @@
-package main
+package util_test
 
 import (
 	"slices"
 	"testing"
+
+	"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
 )
 
 func TestNormaliseURL(t *testing.T) {
@@ -48,7 +50,7 @@ func TestNormaliseURL(t *testing.T) {
 		t.Run(tc.name, func(t *testing.T) {
 			t.Parallel()
 
-			got, err := normaliseURL(tc.inputURL)
+			got, err := util.NormaliseURL(tc.inputURL)
 			if err != nil {
 				t.Fatalf(
 					"Test %d - '%s' FAILED: unexpected error: %v",
@@ -77,70 +79,3 @@ func TestNormaliseURL(t *testing.T) {
 		})
 	}
 }
-
-func TestEqualDomains(t *testing.T) {
-	t.Parallel()
-
-	cases := []struct {
-		name string
-		urlA string
-		urlB string
-		want bool
-	}{
-		{
-			name: "Same domain, different paths",
-			urlA: "https://example.com/news",
-			urlB: "https://example.com/about/contact",
-			want: true,
-		},
-		{
-			name: "Different domains, same path",
-			urlA: "http://example.com/blog",
-			urlB: "http://example.org/blog",
-			want: false,
-		},
-		{
-			name: "Same domain, different protocols",
-			urlA: "http://code.person.me.uk/projects/orion",
-			urlB: "https://code.person.me.uk/user/person/README.md",
-			want: true,
-		},
-	}
-
-	for ind, tc := range slices.All(cases) {
-		t.Run(tc.name, testEqualDomains(ind+1, tc.name, tc.urlA, tc.urlB, tc.want))
-	}
-}
-
-func testEqualDomains(testNum int, testName, urlA, urlB string, want bool) func(t *testing.T) {
-	return func(t *testing.T) {
-		t.Parallel()
-
-		got, err := equalDomains(urlA, urlB)
-		if err != nil {
-			t.Fatalf(
-				"Test %d - '%s' FAILED: unexpected error: %v",
-				testNum,
-				testName,
-				err,
-			)
-		}
-
-		if got != want {
-			t.Errorf(
-				"Test %d - '%s' FAILED: unexpected domain comparison received: want %t, got %t",
-				testNum,
-				testName,
-				want,
-				got,
-			)
-		} else {
-			t.Logf(
-				"Test %d - '%s' PASSED: expected domain comparison received: got %t",
-				testNum,
-				testName,
-				got,
-			)
-		}
-	}
-}
diff --git a/main.go b/main.go
index 4034cc8..2633e44 100644
--- a/main.go
+++ b/main.go
@@ -1,15 +1,10 @@
 package main
 
 import (
-	"context"
 	"errors"
 	"fmt"
-	"io"
 	"maps"
-	"net/http"
 	"os"
-	"strings"
-	"time"
 )
 
 var (
@@ -36,16 +31,16 @@ func run() error {
 		return errTooManyArgs
 	}
 
-	baseURL := args[0]
+	//baseURL := args[0]
 
 	pages := make(map[string]int)
 
-	var err error
+	//var err error
 
-	pages, err = crawlPage(baseURL, baseURL, pages)
-	if err != nil {
-		return fmt.Errorf("received an error while crawling the website: %w", err)
-	}
+	//pages, err = crawlPage(baseURL, baseURL, pages)
+	//if err != nil {
+	//	return fmt.Errorf("received an error while crawling the website: %w", err)
+	//}
 
 	fmt.Printf("\n\nRESULTS:\n")
 
@@ -55,102 +50,3 @@ func run() error {
 
 	return nil
 }
-
-func crawlPage(rawBaseURL, rawCurrentURL string, pages map[string]int) (map[string]int, error) {
-	var err error
-
-	// if current URL is not on the same domain as the base URL, return the current pages.
-	sameDomain, err := equalDomains(rawBaseURL, rawCurrentURL)
-	if err != nil {
-		return pages, err
-	}
-
-	if !sameDomain {
-		return pages, nil
-	}
-
-	// get normalised version of rawCurrentURL
-	normalisedCurrentURL, err := normaliseURL(rawCurrentURL)
-	if err != nil {
-		return pages, err
-	}
-
-	// check if normalised URL has an entry in pages.
-	_, exists := pages[normalisedCurrentURL]
-
-	// If it has an entry, increment the count by 1 and return the pages.
-	if exists {
-		pages[normalisedCurrentURL]++
-
-		return pages, nil
-	}
-
-	// Create an entry for the page
-	pages[normalisedCurrentURL] = 1
-
-	// Get the HTML from the current URL, print that you are getting the HTML doc from current URL.
-	fmt.Printf("Crawling %q\n", rawCurrentURL)
-
-	htmlDoc, err := getHTML(rawCurrentURL)
-	if err != nil {
-		return pages, fmt.Errorf("error retrieving the HTML document from %q: %w", rawCurrentURL, err)
-	}
-
-	// Get all the URLs from the HTML doc.
-	links, err := getURLsFromHTML(htmlDoc, rawBaseURL)
-	if err != nil {
-		return pages, fmt.Errorf("error retrieving the links from the HTML document: %w", err)
-	}
-
-	// Recursively crawl each URL on the page. (add a timeout?)
-	for ind := range len(links) {
-		time.Sleep(time.Duration(1 * time.Second))
-
-		pages, err = crawlPage(rawBaseURL, links[ind], pages)
-		if err != nil {
-			fmt.Println("WARNING: error received while crawling %q: %v", links[ind], err)
-		}
-	}
-
-	return pages, nil
-}
-
-func getHTML(rawURL string) (string, error) {
-	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(10*time.Second))
-	defer cancel()
-
-	request, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
-	if err != nil {
-		return "", fmt.Errorf("error creating the HTTP request: %w", err)
-	}
-
-	client := http.Client{}
-
-	resp, err := client.Do(request)
-	if err != nil {
-		return "", fmt.Errorf("error getting the response: %w", err)
-	}
-
-	defer resp.Body.Close()
-
-	if resp.StatusCode >= 400 {
-		return "", fmt.Errorf(
-			"received a bad status from %s: (%d) %s",
-			rawURL,
-			resp.StatusCode,
-			resp.Status,
-		)
-	}
-
-	contentType := resp.Header.Get("content-type")
-	if !strings.Contains(contentType, "text/html") {
-		return "", fmt.Errorf("unexpected content type received: want text/html, got %s", contentType)
-	}
-
-	data, err := io.ReadAll(resp.Body)
-	if err != nil {
-		return "", fmt.Errorf("error reading the data from the response: %w", err)
-	}
-
-	return string(data), nil
-}
diff --git a/url.go b/url.go
deleted file mode 100644
index e049d56..0000000
--- a/url.go
+++ /dev/null
@@ -1,32 +0,0 @@
-package main
-
-import (
-	"fmt"
-	"net/url"
-	"strings"
-)
-
-func normaliseURL(rawURL string) (string, error) {
-	const normalisedFormat string = "%s%s"
-
-	parsedURL, err := url.Parse(rawURL)
-	if err != nil {
-		return "", fmt.Errorf("error parsing the URL %q: %w", rawURL, err)
-	}
-
-	return fmt.Sprintf(normalisedFormat, parsedURL.Hostname(), strings.TrimSuffix(parsedURL.Path, "/")), nil
-}
-
-func equalDomains(urlA, urlB string) (bool, error) {
-	parsedURLA, err := url.Parse(urlA)
-	if err != nil {
-		return false, fmt.Errorf("error parsing the URL %q: %w", urlA, err)
-	}
-
-	parsedURLB, err := url.Parse(urlB)
-	if err != nil {
-		return false, fmt.Errorf("error parsing the URL %q: %w", urlB, err)
-	}
-
-	return parsedURLA.Hostname() == parsedURLB.Hostname(), nil
-}
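
Usage note (illustrative, not part of the diff): with getURLsFromHTML and normaliseURL now exported from internal/util, other packages in this module can call them directly. Below is a minimal sketch of a caller, assuming, as the tests suggest, that GetURLsFromHTML resolves relative hrefs against rawBaseURL; the HTML snippet and base URL here are made up for illustration.

	package main

	import (
		"fmt"
		"log"

		"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
	)

	func main() {
		// A made-up page containing a single relative link.
		htmlBody := `<html><body><a href="/path/one">one</a></body></html>`

		// Extract the links from the document, relative to the base URL.
		links, err := util.GetURLsFromHTML(htmlBody, "https://blog.boot.dev")
		if err != nil {
			log.Fatalf("unable to extract the links: %v", err)
		}

		// Normalise each link to host/path form, the same form the
		// (currently disabled) crawlPage used as keys of the pages map.
		for _, link := range links {
			normalised, err := util.NormaliseURL(link)
			if err != nil {
				log.Fatalf("unable to normalise %q: %v", link, err)
			}

			fmt.Println(normalised) // expected: blog.boot.dev/path/one
		}
	}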