generated from templates/go-generic
checkpoint: a bit of project restructuring
This commit is contained in:
parent
1d493e80c7
commit
14dd76d25c
9 changed files with 37 additions and 218 deletions
|
@ -1,4 +1,4 @@
|
||||||
package main
|
package util
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
@ -8,7 +8,7 @@ import (
|
||||||
"golang.org/x/net/html"
|
"golang.org/x/net/html"
|
||||||
)
|
)
|
||||||
|
|
||||||
func getURLsFromHTML(htmlBody, rawBaseURL string) ([]string, error) {
|
func GetURLsFromHTML(htmlBody, rawBaseURL string) ([]string, error) {
|
||||||
htmlDoc, err := html.Parse(strings.NewReader(htmlBody))
|
htmlDoc, err := html.Parse(strings.NewReader(htmlBody))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return []string{}, fmt.Errorf("unable to parse the HTML document: %w", err)
|
return []string{}, fmt.Errorf("unable to parse the HTML document: %w", err)
|
|
@ -1,10 +1,12 @@
|
||||||
package main
|
package util_test
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"os"
|
"os"
|
||||||
"reflect"
|
"reflect"
|
||||||
"slices"
|
"slices"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
|
"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestGetURLsFromHTML(t *testing.T) {
|
func TestGetURLsFromHTML(t *testing.T) {
|
||||||
|
@ -18,7 +20,7 @@ func TestGetURLsFromHTML(t *testing.T) {
|
||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
name: "HTML documentation using blog.boot.dev",
|
name: "HTML documentation using blog.boot.dev",
|
||||||
filepath: "tests/GetURLFromHTML/blog.boot.dev.html",
|
filepath: "testdata/GetURLFromHTML/blog.boot.dev.html",
|
||||||
baseURL: "https://blog.boot.dev",
|
baseURL: "https://blog.boot.dev",
|
||||||
want: []string{
|
want: []string{
|
||||||
"https://blog.boot.dev/path/one",
|
"https://blog.boot.dev/path/one",
|
||||||
|
@ -27,7 +29,7 @@ func TestGetURLsFromHTML(t *testing.T) {
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "HTML documentation using https://ben-bartlett.me.uk",
|
name: "HTML documentation using https://ben-bartlett.me.uk",
|
||||||
filepath: "tests/GetURLFromHTML/ben-bartlett.html",
|
filepath: "testdata/GetURLFromHTML/ben-bartlett.html",
|
||||||
baseURL: "https://ben-bartlett.me.uk",
|
baseURL: "https://ben-bartlett.me.uk",
|
||||||
want: []string{
|
want: []string{
|
||||||
"https://ben-bartlett.me.uk",
|
"https://ben-bartlett.me.uk",
|
||||||
|
@ -41,7 +43,7 @@ func TestGetURLsFromHTML(t *testing.T) {
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "HTML documentation using https://simple.cooking",
|
name: "HTML documentation using https://simple.cooking",
|
||||||
filepath: "tests/GetURLFromHTML/my-simple-cooking-website.html",
|
filepath: "testdata/GetURLFromHTML/my-simple-cooking-website.html",
|
||||||
baseURL: "https://simple.cooking",
|
baseURL: "https://simple.cooking",
|
||||||
want: []string{
|
want: []string{
|
||||||
"https://simple.cooking/recipes/sweet-n-sour-kung-pao-style-chicken",
|
"https://simple.cooking/recipes/sweet-n-sour-kung-pao-style-chicken",
|
||||||
|
@ -73,7 +75,7 @@ func testGetURLsFromHTML(path, baseURL string, want []string) func(t *testing.T)
|
||||||
t.Fatalf("%s unable to open read data from %s: %v", failedTestPrefix, path, err)
|
t.Fatalf("%s unable to open read data from %s: %v", failedTestPrefix, path, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
got, err := getURLsFromHTML(string(htmlDoc), baseURL)
|
got, err := util.GetURLsFromHTML(string(htmlDoc), baseURL)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf(
|
t.Fatalf(
|
||||||
"Test TestGetURLsFromHTML FAILED: unexpected error: %v",
|
"Test TestGetURLsFromHTML FAILED: unexpected error: %v",
|
18
internal/util/url.go
Normal file
18
internal/util/url.go
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
package util
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"net/url"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// NormaliseURL reduces rawURL to a "host/path" key that can be used to
// recognise the same page reached through different spellings of its URL.
//
// The scheme, port, user info, query string and fragment are dropped, a
// single trailing slash is trimmed from the path and the host name is
// lower-cased. An error is returned when rawURL cannot be parsed.
func NormaliseURL(rawURL string) (string, error) {
	parsedURL, err := url.Parse(rawURL)
	if err != nil {
		return "", fmt.Errorf("error parsing the URL %q: %w", rawURL, err)
	}

	// Host names are case-insensitive, so fold the case to stop
	// "Example.com/a" and "example.com/a" becoming two different keys.
	host := strings.ToLower(parsedURL.Hostname())

	return host + strings.TrimSuffix(parsedURL.Path, "/"), nil
}
|
|
@ -1,8 +1,10 @@
|
||||||
package main
|
package util_test
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"slices"
|
"slices"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
|
"codeflow.dananglin.me.uk/apollo/web-crawler/internal/util"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestNormaliseURL(t *testing.T) {
|
func TestNormaliseURL(t *testing.T) {
|
||||||
|
@ -48,7 +50,7 @@ func TestNormaliseURL(t *testing.T) {
|
||||||
t.Run(tc.name, func(t *testing.T) {
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
got, err := normaliseURL(tc.inputURL)
|
got, err := util.NormaliseURL(tc.inputURL)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf(
|
t.Fatalf(
|
||||||
"Test %d - '%s' FAILED: unexpected error: %v",
|
"Test %d - '%s' FAILED: unexpected error: %v",
|
||||||
|
@ -77,70 +79,3 @@ func TestNormaliseURL(t *testing.T) {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestEqualDomains(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
cases := []struct {
|
|
||||||
name string
|
|
||||||
urlA string
|
|
||||||
urlB string
|
|
||||||
want bool
|
|
||||||
}{
|
|
||||||
{
|
|
||||||
name: "Same domain, different paths",
|
|
||||||
urlA: "https://example.com/news",
|
|
||||||
urlB: "https://example.com/about/contact",
|
|
||||||
want: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Different domains, same path",
|
|
||||||
urlA: "http://example.com/blog",
|
|
||||||
urlB: "http://example.org/blog",
|
|
||||||
want: false,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Same domain, different protocols",
|
|
||||||
urlA: "http://code.person.me.uk/projects/orion",
|
|
||||||
urlB: "https://code.person.me.uk/user/person/README.md",
|
|
||||||
want: true,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
for ind, tc := range slices.All(cases) {
|
|
||||||
t.Run(tc.name, testEqualDomains(ind+1, tc.name, tc.urlA, tc.urlB, tc.want))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func testEqualDomains(testNum int, testName, urlA, urlB string, want bool) func(t *testing.T) {
|
|
||||||
return func(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
got, err := equalDomains(urlA, urlB)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf(
|
|
||||||
"Test %d - '%s' FAILED: unexpected error: %v",
|
|
||||||
testNum,
|
|
||||||
testName,
|
|
||||||
err,
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
if got != want {
|
|
||||||
t.Errorf(
|
|
||||||
"Test %d - '%s' FAILED: unexpected domain comparison received: want %t, got %t",
|
|
||||||
testNum,
|
|
||||||
testName,
|
|
||||||
want,
|
|
||||||
got,
|
|
||||||
)
|
|
||||||
} else {
|
|
||||||
t.Logf(
|
|
||||||
"Test %d - '%s' PASSED: expected domain comparison received: got %t",
|
|
||||||
testNum,
|
|
||||||
testName,
|
|
||||||
got,
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
116
main.go
116
main.go
|
@ -1,15 +1,10 @@
|
||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
|
||||||
"maps"
|
"maps"
|
||||||
"net/http"
|
|
||||||
"os"
|
"os"
|
||||||
"strings"
|
|
||||||
"time"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
|
@ -36,16 +31,16 @@ func run() error {
|
||||||
return errTooManyArgs
|
return errTooManyArgs
|
||||||
}
|
}
|
||||||
|
|
||||||
baseURL := args[0]
|
//baseURL := args[0]
|
||||||
|
|
||||||
pages := make(map[string]int)
|
pages := make(map[string]int)
|
||||||
|
|
||||||
var err error
|
//var err error
|
||||||
|
|
||||||
pages, err = crawlPage(baseURL, baseURL, pages)
|
//pages, err = crawlPage(baseURL, baseURL, pages)
|
||||||
if err != nil {
|
//if err != nil {
|
||||||
return fmt.Errorf("received an error while crawling the website: %w", err)
|
// return fmt.Errorf("received an error while crawling the website: %w", err)
|
||||||
}
|
//}
|
||||||
|
|
||||||
fmt.Printf("\n\nRESULTS:\n")
|
fmt.Printf("\n\nRESULTS:\n")
|
||||||
|
|
||||||
|
@ -55,102 +50,3 @@ func run() error {
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func crawlPage(rawBaseURL, rawCurrentURL string, pages map[string]int) (map[string]int, error) {
|
|
||||||
var err error
|
|
||||||
|
|
||||||
// if current URL is not on the same domain as the base URL, return the current pages.
|
|
||||||
sameDomain, err := equalDomains(rawBaseURL, rawCurrentURL)
|
|
||||||
if err != nil {
|
|
||||||
return pages, err
|
|
||||||
}
|
|
||||||
|
|
||||||
if !sameDomain {
|
|
||||||
return pages, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// get normalised version of rawCurrentURL
|
|
||||||
normalisedCurrentURL, err := normaliseURL(rawCurrentURL)
|
|
||||||
if err != nil {
|
|
||||||
return pages, err
|
|
||||||
}
|
|
||||||
|
|
||||||
// check if normalised URL has an entry in pages.
|
|
||||||
_, exists := pages[normalisedCurrentURL]
|
|
||||||
|
|
||||||
// If it has an entry, increment the count by 1 and return the pages.
|
|
||||||
if exists {
|
|
||||||
pages[normalisedCurrentURL]++
|
|
||||||
|
|
||||||
return pages, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create an entry for the page
|
|
||||||
pages[normalisedCurrentURL] = 1
|
|
||||||
|
|
||||||
// Get the HTML from the current URL, print that you are getting the HTML doc from current URL.
|
|
||||||
fmt.Printf("Crawling %q\n", rawCurrentURL)
|
|
||||||
|
|
||||||
htmlDoc, err := getHTML(rawCurrentURL)
|
|
||||||
if err != nil {
|
|
||||||
return pages, fmt.Errorf("error retrieving the HTML document from %q: %w", rawCurrentURL, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get all the URLs from the HTML doc.
|
|
||||||
links, err := getURLsFromHTML(htmlDoc, rawBaseURL)
|
|
||||||
if err != nil {
|
|
||||||
return pages, fmt.Errorf("error retrieving the links from the HTML document: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Recursively crawl each URL on the page. (add a timeout?)
|
|
||||||
for ind := range len(links) {
|
|
||||||
time.Sleep(time.Duration(1 * time.Second))
|
|
||||||
|
|
||||||
pages, err = crawlPage(rawBaseURL, links[ind], pages)
|
|
||||||
if err != nil {
|
|
||||||
fmt.Println("WARNING: error received while crawling %q: %v", links[ind], err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return pages, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// getHTML performs an HTTP GET on rawURL and returns the response body as a
// string.
//
// The whole request is bounded by a 10 second timeout. An error is returned
// when the request cannot be built or sent, when the status code is 400 or
// above, when the Content-Type header does not mention text/html, or when
// the body cannot be read.
func getHTML(rawURL string) (string, error) {
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	request, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
	if err != nil {
		return "", fmt.Errorf("error creating the HTTP request: %w", err)
	}

	client := http.Client{}

	resp, err := client.Do(request)
	if err != nil {
		return "", fmt.Errorf("error getting the response: %w", err)
	}

	defer resp.Body.Close()

	if resp.StatusCode >= 400 {
		return "", fmt.Errorf(
			"received a bad status from %s: (%d) %s",
			rawURL,
			resp.StatusCode,
			resp.Status,
		)
	}

	// Media types are case-insensitive, so compare against a lower-cased
	// copy; the original value is kept for the error message.
	contentType := resp.Header.Get("content-type")
	if !strings.Contains(strings.ToLower(contentType), "text/html") {
		return "", fmt.Errorf("unexpected content type received: want text/html, got %s", contentType)
	}

	data, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("error reading the data from the response: %w", err)
	}

	return string(data), nil
}
|
|
||||||
|
|
32
url.go
32
url.go
|
@ -1,32 +0,0 @@
|
||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"net/url"
|
|
||||||
"strings"
|
|
||||||
)
|
|
||||||
|
|
||||||
// normaliseURL reduces rawURL to a "host/path" key that can be used to
// recognise the same page reached through different spellings of its URL.
//
// The scheme, port, user info, query string and fragment are dropped, a
// single trailing slash is trimmed from the path and the host name is
// lower-cased. An error is returned when rawURL cannot be parsed.
func normaliseURL(rawURL string) (string, error) {
	parsedURL, err := url.Parse(rawURL)
	if err != nil {
		return "", fmt.Errorf("error parsing the URL %q: %w", rawURL, err)
	}

	// Host names are case-insensitive, so fold the case to stop
	// "Example.com/a" and "example.com/a" becoming two different keys.
	host := strings.ToLower(parsedURL.Hostname())

	return host + strings.TrimSuffix(parsedURL.Path, "/"), nil
}
|
|
||||||
|
|
||||||
// equalDomains reports whether urlA and urlB refer to the same host.
//
// Only the host names are compared: scheme, port, path, query and fragment
// are ignored, and the comparison is case-insensitive. An error is returned
// when either URL cannot be parsed.
func equalDomains(urlA, urlB string) (bool, error) {
	parsedURLA, err := url.Parse(urlA)
	if err != nil {
		return false, fmt.Errorf("error parsing the URL %q: %w", urlA, err)
	}

	parsedURLB, err := url.Parse(urlB)
	if err != nil {
		return false, fmt.Errorf("error parsing the URL %q: %w", urlB, err)
	}

	// Host names are case-insensitive, so a plain == would wrongly treat
	// "Example.com" and "example.com" as different domains.
	return strings.EqualFold(parsedURLA.Hostname(), parsedURLB.Hostname()), nil
}
|
|
Loading…
Reference in a new issue