// Package crawler implements a concurrent, same-domain web crawler used to
// build sitemaps: it walks pages breadth-first up to a configured depth,
// records each discovered page, and streams progress events to a channel.
package crawler

import (
	"fmt"
	"math"
	"net/http"
	"net/url"
	"strings"
	"sync"
	"time"

	"golang.org/x/net/html"

	"sitemap-api/database"
	"sitemap-api/models"
)

// maxConcurrentFetches bounds the number of HTTP requests in flight for a
// single crawl (crawl-wide, not per page).
const maxConcurrentFetches = 10

// Crawler performs one crawl at a time. It is not safe to run Crawl
// concurrently on the same Crawler: per-crawl state (visited set, counters,
// event channel, semaphore) lives on the struct.
type Crawler struct {
	db           *database.DB
	maxDepth     int
	visited      map[string]bool // normalized URLs already claimed by some branch
	mu           sync.Mutex      // guards visited, totalPages, currentDepth
	baseURL      *url.URL
	client       *http.Client
	eventChan    chan models.Event
	uuid         string
	siteID       int
	currentDepth int
	totalPages   int
	sem          chan struct{} // crawl-wide fetch limiter; see fetchLinks
}

// NewCrawler returns a Crawler backed by db, with a 10-second request timeout
// and a cap of 10 redirects per request.
func NewCrawler(db *database.DB) *Crawler {
	return &Crawler{
		db: db,
		client: &http.Client{
			Timeout: 10 * time.Second,
			CheckRedirect: func(req *http.Request, via []*http.Request) error {
				if len(via) >= 10 {
					return fmt.Errorf("too many redirects")
				}
				return nil
			},
		},
	}
}

// Crawl walks the site rooted at startURL down to maxDepth, persisting each
// page and emitting started/progress/complete (or error) events on eventChan.
// It blocks until the crawl finishes.
func (c *Crawler) Crawl(uuid string, startURL string, maxDepth int, eventChan chan models.Event) {
	c.uuid = uuid
	c.maxDepth = maxDepth
	c.eventChan = eventChan
	c.visited = make(map[string]bool)
	c.totalPages = 0
	// One semaphore per crawl so the fetch limit applies globally. (The
	// previous implementation made a new semaphore inside every crawlURL
	// call, allowing 10 concurrent fetches *per parent page* — effectively
	// unbounded concurrency across the crawl.)
	c.sem = make(chan struct{}, maxConcurrentFetches)

	// Parse base URL; it anchors the same-domain check.
	parsedURL, err := url.Parse(startURL)
	if err != nil {
		c.sendEvent("error", models.ErrorData{
			UUID:  uuid,
			Error: fmt.Sprintf("Invalid URL: %v", err),
		})
		return
	}
	c.baseURL = parsedURL

	// Look up the site record so pages can be attached to it.
	site, err := c.db.GetSiteByUUID(uuid)
	if err != nil {
		c.sendEvent("error", models.ErrorData{
			UUID:  uuid,
			Error: fmt.Sprintf("Failed to get site: %v", err),
		})
		return
	}
	c.siteID = site.ID

	c.sendEvent("started", map[string]interface{}{
		"uuid":      uuid,
		"url":       startURL,
		"max_depth": maxDepth,
	})

	// Walk the site from the root; returns when every branch is done.
	c.crawlURL(startURL, 0)

	if err = c.db.UpdateSiteStatus(uuid, "completed", c.totalPages); err != nil {
		c.sendEvent("error", models.ErrorData{
			UUID:  uuid,
			Error: fmt.Sprintf("Failed to update status: %v", err),
		})
		return
	}

	c.sendEvent("complete", models.CompleteData{
		UUID:        uuid,
		PagesFound:  c.totalPages,
		SiteID:      c.siteID,
		DownloadURL: fmt.Sprintf("/download/%s", uuid),
	})
}

// crawlURL records urlStr as a page at the given depth, then fetches it and
// recurses concurrently into any same-domain links one level deeper. It is
// safe to call from multiple goroutines.
func (c *Crawler) crawlURL(urlStr string, depth int) {
	if depth > c.maxDepth {
		return
	}

	normalizedURL := c.normalizeURL(urlStr)
	if normalizedURL == "" {
		return
	}

	// Claim the URL under the lock so concurrent branches cannot visit it
	// twice; counters are updated in the same critical section.
	c.mu.Lock()
	if c.visited[normalizedURL] {
		c.mu.Unlock()
		return
	}
	c.visited[normalizedURL] = true
	c.totalPages++
	currentTotal := c.totalPages
	c.currentDepth = depth
	c.mu.Unlock()

	c.sendEvent("progress", models.ProgressData{
		UUID:       c.uuid,
		PagesFound: currentTotal,
		Depth:      depth,
		CurrentURL: normalizedURL,
	})

	// Persist the page before fetching so it is recorded even if the fetch
	// later fails.
	page := &models.Page{
		SiteID:       c.siteID,
		URL:          normalizedURL,
		Depth:        depth,
		LastModified: time.Now(),
		Priority:     c.calculatePriority(depth),
		ChangeFreq:   "monthly",
	}
	if err := c.db.AddPage(page); err != nil {
		// Log and keep going; one failed insert should not stop the crawl.
		fmt.Printf("Failed to save page %s: %v\n", normalizedURL, err)
	}

	links := c.fetchLinks(normalizedURL)
	if depth+1 > c.maxDepth || len(links) == 0 {
		return
	}

	// Fan out to child pages; concurrency of the actual HTTP requests is
	// bounded inside fetchLinks, so goroutines here only gate on the limiter.
	var wg sync.WaitGroup
	for _, link := range links {
		wg.Add(1)
		go func(l string) {
			defer wg.Done()
			c.crawlURL(l, depth+1)
		}(link)
	}
	wg.Wait()
}

// fetchLinks downloads pageURL and returns the same-domain links found in its
// HTML body. Fetch errors and non-HTML responses yield no links. The
// crawl-wide semaphore is held only for the fetch/parse — never while waiting
// on child pages — so deep recursion cannot deadlock the limiter.
func (c *Crawler) fetchLinks(pageURL string) []string {
	c.sem <- struct{}{}        // acquire fetch slot
	defer func() { <-c.sem }() // release

	resp, err := c.client.Get(pageURL)
	if err != nil {
		return nil
	}
	defer resp.Body.Close()

	// Only HTML documents are parsed for links.
	if !strings.Contains(resp.Header.Get("Content-Type"), "text/html") {
		return nil
	}
	return c.extractLinks(resp)
}

// extractLinks scans the response body for <a href> values, resolving each
// against the page's final URL and keeping only same-domain, crawlable links.
func (c *Crawler) extractLinks(resp *http.Response) []string {
	// Relative hrefs must resolve against the URL of the page they appear on
	// (after redirects), not the site root: "foo.html" on /a/b/ is /a/b/foo.html,
	// not /foo.html. resp.Request.URL is the final request URL.
	base := c.baseURL
	if resp.Request != nil && resp.Request.URL != nil {
		base = resp.Request.URL
	}

	var links []string
	tokenizer := html.NewTokenizer(resp.Body)
	for {
		tokenType := tokenizer.Next()
		if tokenType == html.ErrorToken {
			// ErrorToken covers both EOF and malformed input; stop either way.
			break
		}
		if tokenType != html.StartTagToken {
			continue
		}
		token := tokenizer.Token()
		if token.Data != "a" {
			continue
		}
		for _, attr := range token.Attr {
			if attr.Key != "href" {
				continue
			}
			link := c.resolveURL(base, attr.Val)
			if link != "" && c.isSameDomain(link) {
				links = append(links, link)
			}
		}
	}
	return links
}

// resolveURL resolves href (possibly relative) against base, returning the
// absolute URL, or "" if href does not parse.
func (c *Crawler) resolveURL(base *url.URL, href string) string {
	parsedURL, err := url.Parse(href)
	if err != nil {
		return ""
	}
	return base.ResolveReference(parsedURL).String()
}

// normalizeURL canonicalizes urlStr for the visited set: the fragment is
// dropped and a trailing slash is trimmed (the bare host keeps "/").
// Returns "" on parse failure.
func (c *Crawler) normalizeURL(urlStr string) string {
	parsedURL, err := url.Parse(urlStr)
	if err != nil {
		return ""
	}
	parsedURL.Fragment = ""
	parsedURL.Path = strings.TrimSuffix(parsedURL.Path, "/")
	if parsedURL.Path == "" {
		parsedURL.Path = "/"
	}
	return parsedURL.String()
}

// isSameDomain reports whether urlStr is on the crawl's host and does not
// point at a known non-HTML asset (images, styles, scripts, archives, ...).
func (c *Crawler) isSameDomain(urlStr string) bool {
	parsedURL, err := url.Parse(urlStr)
	if err != nil {
		return false
	}
	if parsedURL.Host != c.baseURL.Host {
		return false
	}
	path := strings.ToLower(parsedURL.Path)
	skipExtensions := []string{".pdf", ".jpg", ".jpeg", ".png", ".gif", ".css", ".js", ".xml", ".zip", ".tar", ".gz"}
	for _, ext := range skipExtensions {
		if strings.HasSuffix(path, ext) {
			return false
		}
	}
	return true
}

// calculatePriority maps depth to a sitemap priority: 1.0 for the homepage,
// decreasing by 0.2 per level, floored at 0.3.
func (c *Crawler) calculatePriority(depth int) float64 {
	if depth == 0 {
		return 1.0
	}
	priority := 1.0 - float64(depth)*0.2
	if priority < 0.3 {
		priority = 0.3
	}
	// Round to 2 decimals to avoid artifacts like 0.6000000000000001.
	return math.Round(priority*100) / 100
}

// sendEvent delivers an event without blocking: when the channel is nil or
// full the event is dropped so a slow consumer can never stall the crawl.
func (c *Crawler) sendEvent(eventType string, data interface{}) {
	if c.eventChan == nil {
		return
	}
	select {
	case c.eventChan <- models.Event{Type: eventType, Data: data}:
	default:
		// Receiver not keeping up; drop rather than block.
	}
}