Files
sitemap-generator-xml-golang/crawler.go
2026-02-05 19:13:45 +05:30

288 lines
5.8 KiB
Go

package crawler
import (
"fmt"
"net/http"
"net/url"
"sitemap-api/database"
"sitemap-api/models"
"strings"
"sync"
"time"
"golang.org/x/net/html"
)
// Crawler performs a depth-limited crawl of a single site, persisting each
// discovered page through the database layer and streaming lifecycle events
// (started/progress/error/complete) to the caller over eventChan.
type Crawler struct {
	db           *database.DB      // persistence for sites and discovered pages
	maxDepth     int               // maximum link depth to follow (0 = root page only)
	visited      map[string]bool   // normalized URLs already claimed; guarded by mu
	mu           sync.Mutex        // protects visited, currentDepth and totalPages
	baseURL      *url.URL          // crawl root; resolves relative links, bounds the domain
	client       *http.Client      // shared HTTP client (timeout + redirect cap set in NewCrawler)
	eventChan    chan models.Event // non-blocking event sink; may be nil (events dropped)
	uuid         string            // crawl/site identifier echoed in every event
	siteID       int               // database ID the pages are attached to
	currentDepth int               // depth of the most recently visited page
	totalPages   int               // count of unique pages visited so far
}
// NewCrawler builds a Crawler backed by db. Its HTTP client enforces a
// 10-second request timeout and aborts redirect chains longer than ten hops.
// Per-crawl state (visited set, base URL, ...) is initialized by Crawl.
func NewCrawler(db *database.DB) *Crawler {
	httpClient := &http.Client{
		Timeout: 10 * time.Second,
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			if len(via) >= 10 {
				return fmt.Errorf("too many redirects")
			}
			return nil
		},
	}
	return &Crawler{db: db, client: httpClient}
}
// Crawl walks startURL up to maxDepth levels deep, recording every page it
// finds and reporting lifecycle events on eventChan. On success the site is
// marked "completed" in the database and a final "complete" event is sent;
// any setup or persistence failure is reported as an "error" event instead.
func (c *Crawler) Crawl(uuid string, startURL string, maxDepth int, eventChan chan models.Event) {
	// Reset per-crawl state so the Crawler can be reused.
	c.uuid = uuid
	c.maxDepth = maxDepth
	c.eventChan = eventChan
	c.visited = make(map[string]bool)
	c.totalPages = 0

	// The start URL doubles as the base for resolving relative links.
	root, err := url.Parse(startURL)
	if err != nil {
		c.sendEvent("error", models.ErrorData{
			UUID:  uuid,
			Error: fmt.Sprintf("Invalid URL: %v", err),
		})
		return
	}
	c.baseURL = root

	// Look up the site record so discovered pages can be attached to it.
	site, err := c.db.GetSiteByUUID(uuid)
	if err != nil {
		c.sendEvent("error", models.ErrorData{
			UUID:  uuid,
			Error: fmt.Sprintf("Failed to get site: %v", err),
		})
		return
	}
	c.siteID = site.ID

	c.sendEvent("started", map[string]interface{}{
		"uuid":      uuid,
		"url":       startURL,
		"max_depth": maxDepth,
	})

	// Walk the site starting at the root; crawlURL fans out per page and
	// only returns once every spawned goroutine has finished.
	c.crawlURL(startURL, 0)

	if err = c.db.UpdateSiteStatus(uuid, "completed", c.totalPages); err != nil {
		c.sendEvent("error", models.ErrorData{
			UUID:  uuid,
			Error: fmt.Sprintf("Failed to update status: %v", err),
		})
		return
	}

	c.sendEvent("complete", models.CompleteData{
		UUID:        uuid,
		PagesFound:  c.totalPages,
		SiteID:      c.siteID,
		DownloadURL: fmt.Sprintf("/download/%s", uuid),
	})
}
// crawlURL visits one URL at the given depth: it claims the URL in the
// visited set, emits a progress event, persists the page, fetches the
// document, and recursively crawls the same-domain links it contains with at
// most 5 concurrent fetches per page.
//
// NOTE(review): the page is persisted BEFORE the fetch, so unreachable URLs
// are still recorded — presumably intentional (the link exists on the site);
// confirm against the sitemap requirements.
func (c *Crawler) crawlURL(urlStr string, depth int) {
	if depth > c.maxDepth {
		return
	}
	normalizedURL := c.normalizeURL(urlStr)
	if normalizedURL == "" {
		return
	}

	// De-duplicate under the lock; the first goroutine to get here claims
	// the URL and updates the shared counters.
	c.mu.Lock()
	if c.visited[normalizedURL] {
		c.mu.Unlock()
		return
	}
	c.visited[normalizedURL] = true
	c.totalPages++
	currentTotal := c.totalPages
	c.currentDepth = depth
	c.mu.Unlock()

	c.sendEvent("progress", models.ProgressData{
		UUID:       c.uuid,
		PagesFound: currentTotal,
		Depth:      depth,
		CurrentURL: normalizedURL,
	})

	// Record the page; priority decays with depth (see calculatePriority).
	priority := c.calculatePriority(depth)
	page := &models.Page{
		SiteID:       c.siteID,
		URL:          normalizedURL,
		Depth:        depth,
		LastModified: time.Now(),
		Priority:     priority,
		ChangeFreq:   "monthly",
	}
	if err := c.db.AddPage(page); err != nil {
		// Log and keep going — one failed insert must not stop the crawl.
		fmt.Printf("Failed to save page %s: %v\n", normalizedURL, err)
	}

	resp, err := c.client.Get(normalizedURL)
	if err != nil {
		return
	}
	defer resp.Body.Close()

	// FIX: only follow links from successful responses. Previously 404/500
	// error pages were parsed too, and their navigation links crawled.
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return
	}
	// Only HTML documents contain links worth extracting.
	contentType := resp.Header.Get("Content-Type")
	if !strings.Contains(contentType, "text/html") {
		return
	}

	links := c.extractLinks(resp)

	// Fan out over child links; the buffered channel caps concurrency at 5.
	var wg sync.WaitGroup
	semaphore := make(chan struct{}, 5)
	for _, link := range links {
		if depth+1 > c.maxDepth {
			continue
		}
		wg.Add(1)
		go func(l string) {
			defer wg.Done()
			semaphore <- struct{}{}        // acquire a slot
			defer func() { <-semaphore }() // release even if crawlURL panics
			c.crawlURL(l, depth+1)
		}(link)
	}
	wg.Wait()
}
// extractLinks tokenizes the HTML response body and returns every <a href>
// value that resolves to an absolute URL on the crawl's own domain.
func (c *Crawler) extractLinks(resp *http.Response) []string {
	var links []string
	z := html.NewTokenizer(resp.Body)
	for {
		switch z.Next() {
		case html.ErrorToken:
			// End of document (or a parse error) — either way, stop here.
			return links
		case html.StartTagToken:
			tok := z.Token()
			if tok.Data != "a" {
				continue
			}
			for _, attr := range tok.Attr {
				if attr.Key != "href" {
					continue
				}
				if resolved := c.resolveURL(attr.Val); resolved != "" && c.isSameDomain(resolved) {
					links = append(links, resolved)
				}
			}
		}
	}
}
// resolveURL interprets href relative to the crawl's base URL and returns
// the absolute form, or "" when href cannot be parsed.
func (c *Crawler) resolveURL(href string) string {
	ref, err := url.Parse(href)
	if err != nil {
		return ""
	}
	return c.baseURL.ResolveReference(ref).String()
}
// normalizeURL canonicalizes urlStr so each page is visited exactly once:
// the fragment is dropped, the host is lowercased, and a trailing slash on
// the path is removed — except for the bare root, which stays "/".
// It returns "" for unparseable input.
func (c *Crawler) normalizeURL(urlStr string) string {
	parsedURL, err := url.Parse(urlStr)
	if err != nil {
		return ""
	}
	// Fragments address positions within a page, not distinct pages.
	parsedURL.Fragment = ""
	// FIX: hosts are case-insensitive (RFC 3986 §3.2.2); without lowering,
	// "Example.com/a" and "example.com/a" were counted as two pages.
	parsedURL.Host = strings.ToLower(parsedURL.Host)
	// "/about" and "/about/" are treated as the same page.
	parsedURL.Path = strings.TrimSuffix(parsedURL.Path, "/")
	if parsedURL.Path == "" {
		parsedURL.Path = "/"
	}
	return parsedURL.String()
}
// isSameDomain reports whether urlStr lives on the same host as the crawl
// root and does not point at a known non-HTML asset (images, archives,
// stylesheets, ...). Unparseable URLs are rejected.
func (c *Crawler) isSameDomain(urlStr string) bool {
	parsedURL, err := url.Parse(urlStr)
	if err != nil {
		return false
	}
	// FIX: hostnames are case-insensitive; the previous `!=` comparison
	// wrongly rejected links like "Example.com" during an "example.com" crawl.
	if !strings.EqualFold(parsedURL.Host, c.baseURL.Host) {
		return false
	}
	// Skip extensions that never yield crawlable HTML.
	path := strings.ToLower(parsedURL.Path)
	skipExtensions := []string{".pdf", ".jpg", ".jpeg", ".png", ".gif", ".css", ".js", ".xml", ".zip", ".tar", ".gz"}
	for _, ext := range skipExtensions {
		if strings.HasSuffix(path, ext) {
			return false
		}
	}
	return true
}
// calculatePriority maps crawl depth to a sitemap priority: the homepage
// (depth 0) gets 1.0, and each level below it loses 0.2, with a floor of 0.3.
func (c *Crawler) calculatePriority(depth int) float64 {
	if depth == 0 {
		return 1.0
	}
	p := 1.0 - 0.2*float64(depth)
	if p < 0.3 {
		return 0.3
	}
	return p
}
// sendEvent delivers a typed event on eventChan without ever blocking the
// crawl: when the channel is nil or cannot accept immediately, the event is
// silently dropped.
func (c *Crawler) sendEvent(eventType string, data interface{}) {
	if c.eventChan == nil {
		return
	}
	select {
	case c.eventChan <- models.Event{Type: eventType, Data: data}:
	default:
		// Receiver not keeping up — drop rather than stall the crawler.
	}
}