Increase Concurrency for Faster Crawling
This commit is contained in:
@@ -2,6 +2,7 @@ package crawler
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"math"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/url"
|
"net/url"
|
||||||
"sitemap-api/database"
|
"sitemap-api/database"
|
||||||
@@ -14,17 +15,17 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
type Crawler struct {
|
type Crawler struct {
|
||||||
db *database.DB
|
db *database.DB
|
||||||
maxDepth int
|
maxDepth int
|
||||||
visited map[string]bool
|
visited map[string]bool
|
||||||
mu sync.Mutex
|
mu sync.Mutex
|
||||||
baseURL *url.URL
|
baseURL *url.URL
|
||||||
client *http.Client
|
client *http.Client
|
||||||
eventChan chan models.Event
|
eventChan chan models.Event
|
||||||
uuid string
|
uuid string
|
||||||
siteID int
|
siteID int
|
||||||
currentDepth int
|
currentDepth int
|
||||||
totalPages int
|
totalPages int
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewCrawler(db *database.DB) *Crawler {
|
func NewCrawler(db *database.DB) *Crawler {
|
||||||
@@ -166,16 +167,16 @@ func (c *Crawler) crawlURL(urlStr string, depth int) {
|
|||||||
|
|
||||||
// Crawl found links concurrently (with limited concurrency)
|
// Crawl found links concurrently (with limited concurrency)
|
||||||
var wg sync.WaitGroup
|
var wg sync.WaitGroup
|
||||||
semaphore := make(chan struct{}, 5) // Limit to 5 concurrent requests
|
semaphore := make(chan struct{}, 10) // Limit to 10 concurrent requests
|
||||||
|
|
||||||
for _, link := range links {
|
for _, link := range links {
|
||||||
if depth+1 <= c.maxDepth {
|
if depth+1 <= c.maxDepth {
|
||||||
wg.Add(1)
|
wg.Add(1)
|
||||||
go func(l string) {
|
go func(l string) {
|
||||||
defer wg.Done()
|
defer wg.Done()
|
||||||
semaphore <- struct{}{} // Acquire
|
semaphore <- struct{}{} // Acquire
|
||||||
c.crawlURL(l, depth+1)
|
c.crawlURL(l, depth+1)
|
||||||
<-semaphore // Release
|
<-semaphore // Release
|
||||||
}(link)
|
}(link)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -268,12 +269,13 @@ func (c *Crawler) calculatePriority(depth int) float64 {
|
|||||||
if depth == 0 {
|
if depth == 0 {
|
||||||
return 1.0
|
return 1.0
|
||||||
}
|
}
|
||||||
// Decrease priority with depth
|
// Decrease priority with depth using clean decimals
|
||||||
priority := 1.0 - (float64(depth) * 0.2)
|
priority := 1.0 - (float64(depth) * 0.2)
|
||||||
if priority < 0.3 {
|
if priority < 0.3 {
|
||||||
priority = 0.3
|
priority = 0.3
|
||||||
}
|
}
|
||||||
return priority
|
// Round to 2 decimal places to avoid floating-point precision issues
|
||||||
|
return math.Round(priority*100) / 100
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Crawler) sendEvent(eventType string, data interface{}) {
|
func (c *Crawler) sendEvent(eventType string, data interface{}) {
|
||||||
|
|||||||
Reference in New Issue
Block a user