Increase Concurrency for Faster Crawling
This commit is contained in:
@@ -2,6 +2,7 @@ package crawler
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"sitemap-api/database"
|
||||
@@ -14,17 +15,17 @@ import (
|
||||
)
|
||||
|
||||
type Crawler struct {
|
||||
db *database.DB
|
||||
maxDepth int
|
||||
visited map[string]bool
|
||||
mu sync.Mutex
|
||||
baseURL *url.URL
|
||||
client *http.Client
|
||||
eventChan chan models.Event
|
||||
uuid string
|
||||
siteID int
|
||||
currentDepth int
|
||||
totalPages int
|
||||
db *database.DB
|
||||
maxDepth int
|
||||
visited map[string]bool
|
||||
mu sync.Mutex
|
||||
baseURL *url.URL
|
||||
client *http.Client
|
||||
eventChan chan models.Event
|
||||
uuid string
|
||||
siteID int
|
||||
currentDepth int
|
||||
totalPages int
|
||||
}
|
||||
|
||||
func NewCrawler(db *database.DB) *Crawler {
|
||||
@@ -166,16 +167,16 @@ func (c *Crawler) crawlURL(urlStr string, depth int) {
|
||||
|
||||
// Crawl found links concurrently (with limited concurrency)
|
||||
var wg sync.WaitGroup
|
||||
semaphore := make(chan struct{}, 5) // Limit to 5 concurrent requests
|
||||
semaphore := make(chan struct{}, 10) // Limit to 10 concurrent requests
|
||||
|
||||
for _, link := range links {
|
||||
if depth+1 <= c.maxDepth {
|
||||
wg.Add(1)
|
||||
go func(l string) {
|
||||
defer wg.Done()
|
||||
semaphore <- struct{}{} // Acquire
|
||||
semaphore <- struct{}{} // Acquire
|
||||
c.crawlURL(l, depth+1)
|
||||
<-semaphore // Release
|
||||
<-semaphore // Release
|
||||
}(link)
|
||||
}
|
||||
}
|
||||
@@ -268,12 +269,13 @@ func (c *Crawler) calculatePriority(depth int) float64 {
|
||||
if depth == 0 {
|
||||
return 1.0
|
||||
}
|
||||
// Decrease priority with depth
|
||||
// Decrease priority with depth using clean decimals
|
||||
priority := 1.0 - (float64(depth) * 0.2)
|
||||
if priority < 0.3 {
|
||||
priority = 0.3
|
||||
}
|
||||
return priority
|
||||
// Round to 2 decimal places to avoid floating-point precision issues
|
||||
return math.Round(priority*100) / 100
|
||||
}
|
||||
|
||||
func (c *Crawler) sendEvent(eventType string, data interface{}) {
|
||||
|
||||
Reference in New Issue
Block a user