diff --git a/html.go b/internal/util/html.go
similarity index 95%
rename from html.go
rename to internal/util/html.go
index b65ab62..52716e3 100644
--- a/html.go
+++ b/internal/util/html.go
@@ -1,4 +1,4 @@
-package main
+package util
import (
"fmt"
@@ -8,7 +8,9 @@ import (
"golang.org/x/net/html"
)
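+// GetURLsFromHTML returns the URLs found in the anchor elements of
+// htmlBody, resolved against rawBaseURL.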
-func getURLsFromHTML(htmlBody, rawBaseURL string) ([]string, error) {
+func GetURLsFromHTML(htmlBody, rawBaseURL string) ([]string, error) {
htmlDoc, err := html.Parse(strings.NewReader(htmlBody))
if err != nil {
return []string{}, fmt.Errorf("unable to parse the HTML document: %w", err)
diff --git a/html_test.go b/internal/util/html_test.go
similarity index 87%
rename from html_test.go
rename to internal/util/html_test.go
index c6d30d2..479a966 100644
--- a/html_test.go
+++ b/internal/util/html_test.go
@@ -1,10 +1,12 @@
-package main
+package util_test
import (
"os"
"reflect"
"slices"
"testing"
+
+ "codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
)
func TestGetURLsFromHTML(t *testing.T) {
@@ -18,7 +20,7 @@ func TestGetURLsFromHTML(t *testing.T) {
}{
{
name: "HTML documentation using blog.boot.dev",
- filepath: "tests/GetURLFromHTML/blog.boot.dev.html",
+ filepath: "testdata/GetURLFromHTML/blog.boot.dev.html",
baseURL: "https://blog.boot.dev",
want: []string{
"https://blog.boot.dev/path/one",
@@ -27,7 +29,7 @@ func TestGetURLsFromHTML(t *testing.T) {
},
{
name: "HTML documentation using https://ben-bartlett.me.uk",
- filepath: "tests/GetURLFromHTML/ben-bartlett.html",
+ filepath: "testdata/GetURLFromHTML/ben-bartlett.html",
baseURL: "https://ben-bartlett.me.uk",
want: []string{
"https://ben-bartlett.me.uk",
@@ -41,7 +43,7 @@ func TestGetURLsFromHTML(t *testing.T) {
},
{
name: "HTML documentation using https://simple.cooking",
- filepath: "tests/GetURLFromHTML/my-simple-cooking-website.html",
+ filepath: "testdata/GetURLFromHTML/my-simple-cooking-website.html",
baseURL: "https://simple.cooking",
want: []string{
"https://simple.cooking/recipes/sweet-n-sour-kung-pao-style-chicken",
@@ -73,7 +75,7 @@ func testGetURLsFromHTML(path, baseURL string, want []string) func(t *testing.T)
t.Fatalf("%s unable to open read data from %s: %v", failedTestPrefix, path, err)
}
- got, err := getURLsFromHTML(string(htmlDoc), baseURL)
+ got, err := util.GetURLsFromHTML(string(htmlDoc), baseURL)
if err != nil {
t.Fatalf(
"Test TestGetURLsFromHTML FAILED: unexpected error: %v",
diff --git a/tests/GetURLFromHTML/ben-bartlett.html b/internal/util/testdata/GetURLFromHTML/ben-bartlett.html
similarity index 100%
rename from tests/GetURLFromHTML/ben-bartlett.html
rename to internal/util/testdata/GetURLFromHTML/ben-bartlett.html
diff --git a/tests/GetURLFromHTML/blog.boot.dev.html b/internal/util/testdata/GetURLFromHTML/blog.boot.dev.html
similarity index 100%
rename from tests/GetURLFromHTML/blog.boot.dev.html
rename to internal/util/testdata/GetURLFromHTML/blog.boot.dev.html
diff --git a/tests/GetURLFromHTML/my-simple-cooking-website.html b/internal/util/testdata/GetURLFromHTML/my-simple-cooking-website.html
similarity index 100%
rename from tests/GetURLFromHTML/my-simple-cooking-website.html
rename to internal/util/testdata/GetURLFromHTML/my-simple-cooking-website.html
diff --git a/internal/util/url.go b/internal/util/url.go
new file mode 100644
index 0000000..5e270c5
--- /dev/null
+++ b/internal/util/url.go
@@ -0,0 +1,20 @@
+package util
+
+import (
+ "fmt"
+ "net/url"
+ "strings"
+)
+
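+// NormaliseURL reduces rawURL to its hostname and path, trimming any
+// trailing slash, so equivalent URLs map to the same string.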
+func NormaliseURL(rawURL string) (string, error) {
+ const normalisedFormat string = "%s%s"
+
+ parsedURL, err := url.Parse(rawURL)
+ if err != nil {
+ return "", fmt.Errorf("error parsing the URL %q: %w", rawURL, err)
+ }
+
+ return fmt.Sprintf(normalisedFormat, parsedURL.Hostname(), strings.TrimSuffix(parsedURL.Path, "/")), nil
+}
diff --git a/url_test.go b/internal/util/url_test.go
similarity index 50%
rename from url_test.go
rename to internal/util/url_test.go
index e36d64a..99f591b 100644
--- a/url_test.go
+++ b/internal/util/url_test.go
@@ -1,8 +1,10 @@
-package main
+package util_test
import (
"slices"
"testing"
+
+ "codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
)
func TestNormaliseURL(t *testing.T) {
@@ -48,7 +50,7 @@ func TestNormaliseURL(t *testing.T) {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
- got, err := normaliseURL(tc.inputURL)
+ got, err := util.NormaliseURL(tc.inputURL)
if err != nil {
t.Fatalf(
"Test %d - '%s' FAILED: unexpected error: %v",
@@ -77,70 +79,3 @@ func TestNormaliseURL(t *testing.T) {
})
}
}
-
-func TestEqualDomains(t *testing.T) {
- t.Parallel()
-
- cases := []struct {
- name string
- urlA string
- urlB string
- want bool
- }{
- {
- name: "Same domain, different paths",
- urlA: "https://example.com/news",
- urlB: "https://example.com/about/contact",
- want: true,
- },
- {
- name: "Different domains, same path",
- urlA: "http://example.com/blog",
- urlB: "http://example.org/blog",
- want: false,
- },
- {
- name: "Same domain, different protocols",
- urlA: "http://code.person.me.uk/projects/orion",
- urlB: "https://code.person.me.uk/user/person/README.md",
- want: true,
- },
- }
-
- for ind, tc := range slices.All(cases) {
- t.Run(tc.name, testEqualDomains(ind+1, tc.name, tc.urlA, tc.urlB, tc.want))
- }
-}
-
-func testEqualDomains(testNum int, testName, urlA, urlB string, want bool) func(t *testing.T) {
- return func(t *testing.T) {
- t.Parallel()
-
- got, err := equalDomains(urlA, urlB)
- if err != nil {
- t.Fatalf(
- "Test %d - '%s' FAILED: unexpected error: %v",
- testNum,
- testName,
- err,
- )
- }
-
- if got != want {
- t.Errorf(
- "Test %d - '%s' FAILED: unexpected domain comparison received: want %t, got %t",
- testNum,
- testName,
- want,
- got,
- )
- } else {
- t.Logf(
- "Test %d - '%s' PASSED: expected domain comparison received: got %t",
- testNum,
- testName,
- got,
- )
- }
- }
-}
diff --git a/main.go b/main.go
index 4034cc8..2633e44 100644
--- a/main.go
+++ b/main.go
@@ -1,15 +1,10 @@
package main
import (
- "context"
"errors"
"fmt"
- "io"
"maps"
- "net/http"
"os"
- "strings"
- "time"
)
var (
@@ -36,16 +31,16 @@ func run() error {
return errTooManyArgs
}
- baseURL := args[0]
+ //baseURL := args[0]
pages := make(map[string]int)
- var err error
+ //var err error
- pages, err = crawlPage(baseURL, baseURL, pages)
- if err != nil {
- return fmt.Errorf("received an error while crawling the website: %w", err)
- }
+ //pages, err = crawlPage(baseURL, baseURL, pages)
+ //if err != nil {
+ // return fmt.Errorf("received an error while crawling the website: %w", err)
+ //}
fmt.Printf("\n\nRESULTS:\n")
@@ -55,102 +50,3 @@ func run() error {
return nil
}
-
-func crawlPage(rawBaseURL, rawCurrentURL string, pages map[string]int) (map[string]int, error) {
- var err error
-
- // if current URL is not on the same domain as the base URL, return the current pages.
- sameDomain, err := equalDomains(rawBaseURL, rawCurrentURL)
- if err != nil {
- return pages, err
- }
-
- if !sameDomain {
- return pages, nil
- }
-
- // get normalised version of rawCurrentURL
- normalisedCurrentURL, err := normaliseURL(rawCurrentURL)
- if err != nil {
- return pages, err
- }
-
- // check if normalised URL has an entry in pages.
- _, exists := pages[normalisedCurrentURL]
-
- // If it has an entry, increment the count by 1 and return the pages.
- if exists {
- pages[normalisedCurrentURL]++
-
- return pages, nil
- }
-
- // Create an entry for the page
- pages[normalisedCurrentURL] = 1
-
- // Get the HTML from the current URL, print that you are getting the HTML doc from current URL.
- fmt.Printf("Crawling %q\n", rawCurrentURL)
-
- htmlDoc, err := getHTML(rawCurrentURL)
- if err != nil {
- return pages, fmt.Errorf("error retrieving the HTML document from %q: %w", rawCurrentURL, err)
- }
-
- // Get all the URLs from the HTML doc.
- links, err := getURLsFromHTML(htmlDoc, rawBaseURL)
- if err != nil {
- return pages, fmt.Errorf("error retrieving the links from the HTML document: %w", err)
- }
-
- // Recursively crawl each URL on the page. (add a timeout?)
- for ind := range len(links) {
- time.Sleep(time.Duration(1 * time.Second))
-
- pages, err = crawlPage(rawBaseURL, links[ind], pages)
- if err != nil {
- fmt.Println("WARNING: error received while crawling %q: %v", links[ind], err)
- }
- }
-
- return pages, nil
-}
-
-func getHTML(rawURL string) (string, error) {
- ctx, cancel := context.WithTimeout(context.Background(), time.Duration(10*time.Second))
- defer cancel()
-
- request, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
- if err != nil {
- return "", fmt.Errorf("error creating the HTTP request: %w", err)
- }
-
- client := http.Client{}
-
- resp, err := client.Do(request)
- if err != nil {
- return "", fmt.Errorf("error getting the response: %w", err)
- }
-
- defer resp.Body.Close()
-
- if resp.StatusCode >= 400 {
- return "", fmt.Errorf(
- "received a bad status from %s: (%d) %s",
- rawURL,
- resp.StatusCode,
- resp.Status,
- )
- }
-
- contentType := resp.Header.Get("content-type")
- if !strings.Contains(contentType, "text/html") {
- return "", fmt.Errorf("unexpected content type received: want text/html, got %s", contentType)
- }
-
- data, err := io.ReadAll(resp.Body)
- if err != nil {
- return "", fmt.Errorf("error reading the data from the response: %w", err)
- }
-
- return string(data), nil
-}
diff --git a/url.go b/url.go
deleted file mode 100644
index e049d56..0000000
--- a/url.go
+++ /dev/null
@@ -1,32 +0,0 @@
-package main
-
-import (
- "fmt"
- "net/url"
- "strings"
-)
-
-func normaliseURL(rawURL string) (string, error) {
- const normalisedFormat string = "%s%s"
-
- parsedURL, err := url.Parse(rawURL)
- if err != nil {
- return "", fmt.Errorf("error parsing the URL %q: %w", rawURL, err)
- }
-
- return fmt.Sprintf(normalisedFormat, parsedURL.Hostname(), strings.TrimSuffix(parsedURL.Path, "/")), nil
-}
-
-func equalDomains(urlA, urlB string) (bool, error) {
- parsedURLA, err := url.Parse(urlA)
- if err != nil {
- return false, fmt.Errorf("error parsing the URL %q: %w", urlA, err)
- }
-
- parsedURLB, err := url.Parse(urlB)
- if err != nil {
- return false, fmt.Errorf("error parsing the URL %q: %w", urlB, err)
- }
-
- return parsedURLA.Hostname() == parsedURLB.Hostname(), nil
-}
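
For reference, a minimal sketch of how the relocated helper is now called
from outside the package (the main function below is illustrative only and
is not part of this patch):

package main

import (
	"fmt"

	"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
)

func main() {
	// Both forms normalise to the same key: "example.com/path".
	for _, raw := range []string{"https://example.com/path/", "http://example.com/path"} {
		normalised, err := util.NormaliseURL(raw)
		if err != nil {
			fmt.Println("error:", err)

			continue
		}

		fmt.Println(normalised)
	}
}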