// web-crawler/main.go
package main

import (
	"context"
	"errors"
	"fmt"
	"io"
	"net/http"
	"os"
	"strings"
	"time"
)

var (
	errNoWebsiteProvided = errors.New("no website provided")
	errTooManyArgs       = errors.New("too many arguments provided")
)

func main() {
	if err := run(); err != nil {
		os.Stderr.WriteString("ERROR: " + err.Error() + "\n")
		os.Exit(1)
	}
}
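
// run expects exactly one command-line argument: the website to crawl.
// A hypothetical invocation (the URL is a placeholder):
//
//	go run . https://example.com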
func run() error {
	args := os.Args[1:]
	if len(args) == 0 {
		return errNoWebsiteProvided
	}
	if len(args) > 1 {
		return errTooManyArgs
	}
	baseURL := args[0]

	// Crawl the site starting from its own base URL, counting how often
	// each page is linked.
	pages, err := crawlPage(baseURL, baseURL, make(map[string]int))
	if err != nil {
		return fmt.Errorf("received an error while crawling the website: %w", err)
	}

	fmt.Print("\n\nRESULTS:\n")
	for page, count := range pages {
		fmt.Printf("%s: %d\n", page, count)
	}

	return nil
}

func crawlPage(rawBaseURL, rawCurrentURL string, pages map[string]int) (map[string]int, error) {
	// If the current URL is not on the same domain as the base URL,
	// return the pages collected so far.
	sameDomain, err := equalDomains(rawBaseURL, rawCurrentURL)
	if err != nil {
		return pages, err
	}
	if !sameDomain {
		return pages, nil
	}

	// Get the normalised version of rawCurrentURL so that equivalent URLs
	// map to the same key in pages.
	normalisedCurrentURL, err := normaliseURL(rawCurrentURL)
	if err != nil {
		return pages, err
	}

	// If the normalised URL already has an entry in pages, the page has
	// been crawled before: increment its count and stop.
	if _, exists := pages[normalisedCurrentURL]; exists {
		pages[normalisedCurrentURL]++
		return pages, nil
	}

	// Create an entry for the page.
	pages[normalisedCurrentURL] = 1

	// Fetch the HTML document from the current URL, reporting progress.
	fmt.Printf("Crawling %q\n", rawCurrentURL)
	htmlDoc, err := getHTML(rawCurrentURL)
	if err != nil {
		return pages, fmt.Errorf("error retrieving the HTML document from %q: %w", rawCurrentURL, err)
	}

	// Get all the URLs from the HTML document.
	links, err := getURLsFromHTML(htmlDoc, rawBaseURL)
	if err != nil {
		return pages, fmt.Errorf("error retrieving the links from the HTML document: %w", err)
	}

	// Recursively crawl each URL on the page, sleeping for a second
	// between requests. (TODO: add a timeout?)
	for _, link := range links {
		time.Sleep(time.Second)
		pages, err = crawlPage(rawBaseURL, link, pages)
		if err != nil {
			fmt.Printf("WARNING: error received while crawling %q: %v\n", link, err)
		}
	}
	return pages, nil
}
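
// equalDomains and normaliseURL are called above but are not shown in this
// file (they presumably live elsewhere in the package). The two sketches
// below are hypothetical reconstructions of what they likely do, assuming
// an additional "net/url" import; they are illustrations, not the
// author's code.

// equalDomains reports whether two raw URLs share the same hostname.
func equalDomains(rawURL1, rawURL2 string) (bool, error) {
	url1, err := url.Parse(rawURL1)
	if err != nil {
		return false, fmt.Errorf("error parsing %q: %w", rawURL1, err)
	}
	url2, err := url.Parse(rawURL2)
	if err != nil {
		return false, fmt.Errorf("error parsing %q: %w", rawURL2, err)
	}
	return url1.Hostname() == url2.Hostname(), nil
}

// normaliseURL reduces a raw URL to a scheme-less, trailing-slash-free
// form, so that "https://example.com/path/" and "http://example.com/path"
// count as the same page in the pages map.
func normaliseURL(rawURL string) (string, error) {
	parsed, err := url.Parse(rawURL)
	if err != nil {
		return "", fmt.Errorf("error parsing %q: %w", rawURL, err)
	}
	return strings.ToLower(parsed.Host) + strings.TrimSuffix(parsed.Path, "/"), nil
}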

func getHTML(rawURL string) (string, error) {
	// Bound the whole request with a 10-second timeout so a slow server
	// cannot stall the crawl indefinitely.
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	request, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
	if err != nil {
		return "", fmt.Errorf("error creating the HTTP request: %w", err)
	}
	client := http.Client{}
	resp, err := client.Do(request)
	if err != nil {
		return "", fmt.Errorf("error getting the response: %w", err)
	}
	defer resp.Body.Close()
	if resp.StatusCode >= 400 {
		return "", fmt.Errorf(
			"received a bad status from %s: (%d) %s",
			rawURL,
			resp.StatusCode,
			resp.Status,
		)
	}
	contentType := resp.Header.Get("Content-Type")
	if !strings.Contains(contentType, "text/html") {
		return "", fmt.Errorf("unexpected content type received: want text/html, got %s", contentType)
	}
	data, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("error reading the data from the response: %w", err)
	}
	return string(data), nil
}
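
// getURLsFromHTML is the third helper crawlPage relies on that is not
// shown here. A minimal sketch, assuming the golang.org/x/net/html
// package (an extra dependency) plus a "net/url" import: it walks the
// parsed document and resolves every <a href> against the base URL so
// relative links such as "/about" become absolute.
func getURLsFromHTML(htmlBody, rawBaseURL string) ([]string, error) {
	baseURL, err := url.Parse(rawBaseURL)
	if err != nil {
		return nil, fmt.Errorf("error parsing the base URL %q: %w", rawBaseURL, err)
	}
	doc, err := html.Parse(strings.NewReader(htmlBody))
	if err != nil {
		return nil, fmt.Errorf("error parsing the HTML document: %w", err)
	}
	var links []string
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, attr := range n.Attr {
				if attr.Key != "href" {
					continue
				}
				// Resolve relative hrefs against the base URL; skip
				// malformed ones rather than failing the whole page.
				if link, err := baseURL.Parse(attr.Val); err == nil {
					links = append(links, link.String())
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)
	return links, nil
}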