Files
sitemap-generator-xml-golang/crawler.go
2026-02-05 19:13:45 +05:30

288 lines
5.8 KiB
Go

package crawler
import (
"fmt"
"net/http"
"net/url"
"sitemap-api/database"
"sitemap-api/models"
"strings"
"sync"
"time"
"golang.org/x/net/html"
)
// Crawler performs a depth-limited crawl of a single site, persisting each
// discovered page through the database layer and streaming lifecycle events
// (started/progress/error/complete) to the caller over eventChan.
type Crawler struct {
	db           *database.DB      // persistence for sites and discovered pages
	maxDepth     int               // maximum link depth to follow (0 = root page only)
	visited      map[string]bool   // normalized URLs already claimed; guarded by mu
	mu           sync.Mutex        // protects visited, currentDepth and totalPages
	baseURL      *url.URL          // crawl root; resolves relative links, bounds the domain
	client       *http.Client      // shared HTTP client (timeout + redirect cap set in NewCrawler)
	eventChan    chan models.Event // non-blocking event sink; may be nil (events dropped)
	uuid         string            // crawl/site identifier echoed in every event
	siteID       int               // database ID the pages are attached to
	currentDepth int               // depth of the most recently visited page
	totalPages   int               // count of unique pages visited so far
}
// NewCrawler builds a Crawler backed by db. Its HTTP client enforces a
// 10-second request timeout and aborts redirect chains longer than ten hops.
// Per-crawl state (visited set, base URL, ...) is initialized by Crawl.
func NewCrawler(db *database.DB) *Crawler {
	httpClient := &http.Client{
		Timeout: 10 * time.Second,
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			if len(via) >= 10 {
				return fmt.Errorf("too many redirects")
			}
			return nil
		},
	}
	return &Crawler{db: db, client: httpClient}
}
// Crawl walks startURL up to maxDepth levels deep, recording every page it
// finds and reporting lifecycle events on eventChan. On success the site is
// marked "completed" in the database and a final "complete" event is sent;
// any setup or persistence failure is reported as an "error" event instead.
func (c *Crawler) Crawl(uuid string, startURL string, maxDepth int, eventChan chan models.Event) {
	// Reset per-crawl state so the Crawler can be reused.
	c.uuid = uuid
	c.maxDepth = maxDepth
	c.eventChan = eventChan
	c.visited = make(map[string]bool)
	c.totalPages = 0

	// The start URL doubles as the base for resolving relative links.
	root, err := url.Parse(startURL)
	if err != nil {
		c.sendEvent("error", models.ErrorData{
			UUID:  uuid,
			Error: fmt.Sprintf("Invalid URL: %v", err),
		})
		return
	}
	c.baseURL = root

	// Look up the site record so discovered pages can be attached to it.
	site, err := c.db.GetSiteByUUID(uuid)
	if err != nil {
		c.sendEvent("error", models.ErrorData{
			UUID:  uuid,
			Error: fmt.Sprintf("Failed to get site: %v", err),
		})
		return
	}
	c.siteID = site.ID

	c.sendEvent("started", map[string]interface{}{
		"uuid":      uuid,
		"url":       startURL,
		"max_depth": maxDepth,
	})

	// Walk the site starting at the root; crawlURL fans out per page and
	// only returns once every spawned goroutine has finished.
	c.crawlURL(startURL, 0)

	if err = c.db.UpdateSiteStatus(uuid, "completed", c.totalPages); err != nil {
		c.sendEvent("error", models.ErrorData{
			UUID:  uuid,
			Error: fmt.Sprintf("Failed to update status: %v", err),
		})
		return
	}

	c.sendEvent("complete", models.CompleteData{
		UUID:        uuid,
		PagesFound:  c.totalPages,
		SiteID:      c.siteID,
		DownloadURL: fmt.Sprintf("/download/%s", uuid),
	})
}
// crawlURL visits one URL at the given depth: it claims the URL in the
// visited set, emits a progress event, persists the page, fetches the
// document, and recursively crawls the same-domain links it contains with at
// most 5 concurrent fetches per page.
//
// NOTE(review): the page is persisted BEFORE the fetch, so unreachable URLs
// are still recorded — presumably intentional (the link exists on the site);
// confirm against the sitemap requirements.
func (c *Crawler) crawlURL(urlStr string, depth int) {
	if depth > c.maxDepth {
		return
	}
	normalizedURL := c.normalizeURL(urlStr)
	if normalizedURL == "" {
		return
	}

	// De-duplicate under the lock; the first goroutine to get here claims
	// the URL and updates the shared counters.
	c.mu.Lock()
	if c.visited[normalizedURL] {
		c.mu.Unlock()
		return
	}
	c.visited[normalizedURL] = true
	c.totalPages++
	currentTotal := c.totalPages
	c.currentDepth = depth
	c.mu.Unlock()

	c.sendEvent("progress", models.ProgressData{
		UUID:       c.uuid,
		PagesFound: currentTotal,
		Depth:      depth,
		CurrentURL: normalizedURL,
	})

	// Record the page; priority decays with depth (see calculatePriority).
	priority := c.calculatePriority(depth)
	page := &models.Page{
		SiteID:       c.siteID,
		URL:          normalizedURL,
		Depth:        depth,
		LastModified: time.Now(),
		Priority:     priority,
		ChangeFreq:   "monthly",
	}
	if err := c.db.AddPage(page); err != nil {
		// Log and keep going — one failed insert must not stop the crawl.
		fmt.Printf("Failed to save page %s: %v\n", normalizedURL, err)
	}

	resp, err := c.client.Get(normalizedURL)
	if err != nil {
		return
	}
	defer resp.Body.Close()

	// FIX: only follow links from successful responses. Previously 404/500
	// error pages were parsed too, and their navigation links crawled.
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return
	}
	// Only HTML documents contain links worth extracting.
	contentType := resp.Header.Get("Content-Type")
	if !strings.Contains(contentType, "text/html") {
		return
	}

	links := c.extractLinks(resp)

	// Fan out over child links; the buffered channel caps concurrency at 5.
	var wg sync.WaitGroup
	semaphore := make(chan struct{}, 5)
	for _, link := range links {
		if depth+1 > c.maxDepth {
			continue
		}
		wg.Add(1)
		go func(l string) {
			defer wg.Done()
			semaphore <- struct{}{}        // acquire a slot
			defer func() { <-semaphore }() // release even if crawlURL panics
			c.crawlURL(l, depth+1)
		}(link)
	}
	wg.Wait()
}
// extractLinks tokenizes the HTML response body and returns every <a href>
// value that resolves to an absolute URL on the crawl's own domain.
func (c *Crawler) extractLinks(resp *http.Response) []string {
	var links []string
	z := html.NewTokenizer(resp.Body)
	for {
		switch z.Next() {
		case html.ErrorToken:
			// End of document (or a parse error) — either way, stop here.
			return links
		case html.StartTagToken:
			tok := z.Token()
			if tok.Data != "a" {
				continue
			}
			for _, attr := range tok.Attr {
				if attr.Key != "href" {
					continue
				}
				if resolved := c.resolveURL(attr.Val); resolved != "" && c.isSameDomain(resolved) {
					links = append(links, resolved)
				}
			}
		}
	}
}
// resolveURL interprets href relative to the crawl's base URL and returns
// the absolute form, or "" when href cannot be parsed.
func (c *Crawler) resolveURL(href string) string {
	ref, err := url.Parse(href)
	if err != nil {
		return ""
	}
	return c.baseURL.ResolveReference(ref).String()
}
// normalizeURL canonicalizes urlStr so each page is visited exactly once:
// the fragment is dropped, the host is lowercased, and a trailing slash on
// the path is removed — except for the bare root, which stays "/".
// It returns "" for unparseable input.
func (c *Crawler) normalizeURL(urlStr string) string {
	parsedURL, err := url.Parse(urlStr)
	if err != nil {
		return ""
	}
	// Fragments address positions within a page, not distinct pages.
	parsedURL.Fragment = ""
	// FIX: hosts are case-insensitive (RFC 3986 §3.2.2); without lowering,
	// "Example.com/a" and "example.com/a" were counted as two pages.
	parsedURL.Host = strings.ToLower(parsedURL.Host)
	// "/about" and "/about/" are treated as the same page.
	parsedURL.Path = strings.TrimSuffix(parsedURL.Path, "/")
	if parsedURL.Path == "" {
		parsedURL.Path = "/"
	}
	return parsedURL.String()
}
// isSameDomain reports whether urlStr lives on the same host as the crawl
// root and does not point at a known non-HTML asset (images, archives,
// stylesheets, ...). Unparseable URLs are rejected.
func (c *Crawler) isSameDomain(urlStr string) bool {
	parsedURL, err := url.Parse(urlStr)
	if err != nil {
		return false
	}
	// FIX: hostnames are case-insensitive; the previous `!=` comparison
	// wrongly rejected links like "Example.com" during an "example.com" crawl.
	if !strings.EqualFold(parsedURL.Host, c.baseURL.Host) {
		return false
	}
	// Skip extensions that never yield crawlable HTML.
	path := strings.ToLower(parsedURL.Path)
	skipExtensions := []string{".pdf", ".jpg", ".jpeg", ".png", ".gif", ".css", ".js", ".xml", ".zip", ".tar", ".gz"}
	for _, ext := range skipExtensions {
		if strings.HasSuffix(path, ext) {
			return false
		}
	}
	return true
}
// calculatePriority maps crawl depth to a sitemap priority: the homepage
// (depth 0) gets 1.0, and each level below it loses 0.2, with a floor of 0.3.
func (c *Crawler) calculatePriority(depth int) float64 {
	if depth == 0 {
		return 1.0
	}
	p := 1.0 - 0.2*float64(depth)
	if p < 0.3 {
		return 0.3
	}
	return p
}
// sendEvent delivers a typed event on eventChan without ever blocking the
// crawl: when the channel is nil or cannot accept immediately, the event is
// silently dropped.
func (c *Crawler) sendEvent(eventType string, data interface{}) {
	if c.eventChan == nil {
		return
	}
	select {
	case c.eventChan <- models.Event{Type: eventType, Data: data}:
	default:
		// Receiver not keeping up — drop rather than stall the crawler.
	}
}