Increase Concurrency for Faster Crawling

This commit is contained in:
Kar
2026-02-05 19:57:37 +05:30
parent b80e988191
commit 228cf2f05c

View File

@@ -2,6 +2,7 @@ package crawler
import (
"fmt"
"math"
"net/http"
"net/url"
"sitemap-api/database"
@@ -14,17 +15,17 @@ import (
)
type Crawler struct {
-db *database.DB
-maxDepth int
-visited map[string]bool
-mu sync.Mutex
-baseURL *url.URL
-client *http.Client
-eventChan chan models.Event
-uuid string
-siteID int
-currentDepth int
-totalPages int
+db *database.DB
+maxDepth int
+visited map[string]bool
+mu sync.Mutex
+baseURL *url.URL
+client *http.Client
+eventChan chan models.Event
+uuid string
+siteID int
+currentDepth int
+totalPages int
}
func NewCrawler(db *database.DB) *Crawler {
@@ -166,16 +167,16 @@ func (c *Crawler) crawlURL(urlStr string, depth int) {
// Crawl found links concurrently (with limited concurrency)
var wg sync.WaitGroup
-semaphore := make(chan struct{}, 5) // Limit to 5 concurrent requests
+semaphore := make(chan struct{}, 10) // Limit to 10 concurrent requests
for _, link := range links {
if depth+1 <= c.maxDepth {
wg.Add(1)
go func(l string) {
defer wg.Done()
-semaphore <- struct{}{} // Acquire
+semaphore <- struct{}{} // Acquire
c.crawlURL(l, depth+1)
-<-semaphore // Release
+<-semaphore // Release
}(link)
}
}
@@ -268,12 +269,13 @@ func (c *Crawler) calculatePriority(depth int) float64 {
if depth == 0 {
return 1.0
}
-// Decrease priority with depth
+// Decrease priority with depth using clean decimals
priority := 1.0 - (float64(depth) * 0.2)
if priority < 0.3 {
priority = 0.3
}
-return priority
+// Round to 2 decimal places to avoid floating-point precision issues
+return math.Round(priority*100) / 100
}
func (c *Crawler) sendEvent(eventType string, data interface{}) {